diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index 19316c52d12ce..cff870cc905a7 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -581,7 +581,7 @@ if (COMPILER_RT_HAS_AARCH64_SME)
     set_source_files_properties(aarch64/arm_apple_sme_abi.s PROPERTIES COMPILE_FLAGS -march=armv8a+sme)
     message(STATUS "AArch64 Apple SME ABI routines enabled")
   elseif (NOT COMPILER_RT_DISABLE_AARCH64_FMV AND COMPILER_RT_HAS_FNO_BUILTIN_FLAG AND COMPILER_RT_AARCH64_FMV_USES_GLOBAL_CONSTRUCTOR)
-    list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-libc-mem-routines.S aarch64/sme-abi-assert.c aarch64/sme-libc-routines.c)
+    list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-libc-mem-routines.S aarch64/sme-libc-mem-routines-sve.S aarch64/sme-abi-assert.c aarch64/sme-libc-routines.c)
     message(STATUS "AArch64 SME ABI routines enabled")
     set_source_files_properties(aarch64/sme-libc-routines.c PROPERTIES COMPILE_FLAGS "-fno-builtin")
   else()
diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines-sve.S b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines-sve.S
new file mode 100644
index 0000000000000..fd5c234237326
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines-sve.S
@@ -0,0 +1,169 @@
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Routines taken from libc/AOR_v20.02/string/aarch64
+
+#include "../assembly.h"
+
+// These versions use SVE and FP registers. Only use on supported targets.
+#if defined(__ARM_FEATURE_SVE) && __ARM_FP != 0
+
+//
+// __arm_sc_memcpy / __arm_sc_memmove
+//
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define tmp1 x6
+#define vlen x6
+
+#define A_q q0
+#define B_q q1
+#define C_q q2
+#define D_q q3
+#define E_q q4
+#define F_q q5
+#define G_q q6
+#define H_q q7
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+   from a single entry point. It uses unaligned accesses and branchless
+   sequences to keep the code small, simple and improve performance.
+   SVE vectors are used to speedup small copies.
+
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies. The overhead of the overlap
+   check is negligible since it is only required for large copies.
+
+   Large copies use a software pipelined loop processing 64 bytes per iteration.
+   The source pointer is 16-byte aligned to minimize unaligned accesses.
+   The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+DEFINE_COMPILERRT_FUNCTION(__arm_sc_memcpy)
+  cmp count, 128
+  b.hi 3f // copy_long
+  cntb vlen
+  cmp count, vlen, lsl 1
+  b.hi 0f // copy32_128
+
+  whilelo p0.b, xzr, count
+  whilelo p1.b, vlen, count
+  ld1b z0.b, p0/z, [src, 0, mul vl]
+  ld1b z1.b, p1/z, [src, 1, mul vl]
+  st1b z0.b, p0, [dstin, 0, mul vl]
+  st1b z1.b, p1, [dstin, 1, mul vl]
+  ret
+
+  /* Medium copies: 33..128 bytes. */
+0: // copy32_128
+  add srcend, src, count
+  add dstend, dstin, count
+  ldp A_q, B_q, [src]
+  ldp C_q, D_q, [srcend, -32]
+  cmp count, 64
+  b.hi 1f // copy128
+  stp A_q, B_q, [dstin]
+  stp C_q, D_q, [dstend, -32]
+  ret
+
+  /* Copy 65..128 bytes. */
+1: // copy128
+  ldp E_q, F_q, [src, 32]
+  cmp count, 96
+  b.ls 2f // copy96
+  ldp G_q, H_q, [srcend, -64]
+  stp G_q, H_q, [dstend, -64]
+2: // copy96
+  stp A_q, B_q, [dstin]
+  stp E_q, F_q, [dstin, 32]
+  stp C_q, D_q, [dstend, -32]
+  ret
+
+  /* Copy more than 128 bytes. */
+3: // copy_long
+  add srcend, src, count
+  add dstend, dstin, count
+
+  /* Use backwards copy if there is an overlap. */
+  sub tmp1, dstin, src
+  cmp tmp1, count
+  b.lo 6f // copy_long_backwards
+
+  /* Copy 16 bytes and then align src to 16-byte alignment. */
+  ldr D_q, [src]
+  and tmp1, src, 15
+  bic src, src, 15
+  sub dst, dstin, tmp1
+  add count, count, tmp1 /* Count is now 16 too large. */
+  ldp A_q, B_q, [src, 16]
+  str D_q, [dstin]
+  ldp C_q, D_q, [src, 48]
+  subs count, count, 128 + 16 /* Test and readjust count. */
+  b.ls 5f // copy64_from_end
+4: // loop64
+  stp A_q, B_q, [dst, 16]
+  ldp A_q, B_q, [src, 80]
+  stp C_q, D_q, [dst, 48]
+  ldp C_q, D_q, [src, 112]
+  add src, src, 64
+  add dst, dst, 64
+  subs count, count, 64
+  b.hi 4b // loop64
+
+  /* Write the last iteration and copy 64 bytes from the end. */
+5: // copy64_from_end
+  ldp E_q, F_q, [srcend, -64]
+  stp A_q, B_q, [dst, 16]
+  ldp A_q, B_q, [srcend, -32]
+  stp C_q, D_q, [dst, 48]
+  stp E_q, F_q, [dstend, -64]
+  stp A_q, B_q, [dstend, -32]
+  ret
+
+  /* Large backwards copy for overlapping copies.
+     Copy 16 bytes and then align srcend to 16-byte alignment. */
+6: // copy_long_backwards
+  cbz tmp1, 9f // return
+  ldr D_q, [srcend, -16]
+  and tmp1, srcend, 15
+  bic srcend, srcend, 15
+  sub count, count, tmp1
+  ldp A_q, B_q, [srcend, -32]
+  str D_q, [dstend, -16]
+  ldp C_q, D_q, [srcend, -64]
+  sub dstend, dstend, tmp1
+  subs count, count, 128
+  b.ls 8f // copy64_from_start
+
+7: // loop64_backwards
+  str B_q, [dstend, -16]
+  str A_q, [dstend, -32]
+  ldp A_q, B_q, [srcend, -96]
+  str D_q, [dstend, -48]
+  str C_q, [dstend, -64]!
+  ldp C_q, D_q, [srcend, -128]
+  sub srcend, srcend, 64
+  subs count, count, 64
+  b.hi 7b // loop64_backwards
+
+  /* Write the last iteration and copy 64 bytes from the start. */
+8: // copy64_from_start
+  ldp E_q, F_q, [src, 32]
+  stp A_q, B_q, [dstend, -32]
+  ldp A_q, B_q, [src]
+  stp C_q, D_q, [dstend, -64]
+  stp E_q, F_q, [dstin, 32]
+  stp A_q, B_q, [dstin]
+9: // return
+  ret
+END_COMPILERRT_FUNCTION(__arm_sc_memcpy)
+
+DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)
+
+#endif
diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
index e736829967c0c..9b9ee1c6bd860 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
@@ -6,6 +6,8 @@
 
 #include "../assembly.h"
 
+#if !defined(__ARM_FEATURE_SVE) || __ARM_FP == 0
+
 //
 // __arm_sc_memcpy / __arm_sc_memmove
 //
@@ -234,8 +236,10 @@ END_COMPILERRT_FUNCTION(__arm_sc_memcpy)
 
 DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)
 
+#endif // !defined(__ARM_FEATURE_SVE) || __ARM_FP == 0
+
 // This version uses FP registers. Use this only on targets with them
-#if defined(__aarch64__) && __ARM_FP != 0
+#if __ARM_FP != 0
 //
 // __arm_sc_memset
 //
@@ -346,4 +350,4 @@ DEFINE_COMPILERRT_FUNCTION(__arm_sc_memset)
   ret
 END_COMPILERRT_FUNCTION(__arm_sc_memset)
 
-#endif // __aarch64__
+#endif // __ARM_FP != 0
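
For context: per the SME ABI, __arm_sc_memcpy and __arm_sc_memmove are the streaming-compatible memory routines that the compiler calls in place of the regular libc ones inside streaming(-compatible) functions, since those must not touch NEON state. A minimal caller sketch in C, assuming a Clang build with SME support; the function name copy_block is purely illustrative and the exact lowering depends on the compiler and target flags:

  #include <string.h>

  // In a streaming-compatible function the ordinary memcpy (which may use
  // NEON) cannot be called, so the compiler is expected to lower this call
  // to __arm_sc_memcpy, which the routines in this patch provide.
  void *copy_block(void *dst, const void *src, size_t n) __arm_streaming_compatible {
    return memcpy(dst, src, n);
  }

With the preprocessor gating in this patch, each build gets exactly one definition of __arm_sc_memcpy/__arm_sc_memmove: the new SVE file when SVE and FP registers are available at build time, and the existing generic file in sme-libc-mem-routines.S otherwise.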