Skip to content

Commit cc95e40

Browse files
authored
[msan] Handle AVX512 vector down convert (non-mem) intrinsics (#147606)
This handles `llvm.x86.avx512.mask.pmov{,s,us}.*.512` using `handleIntrinsicByApplyingToShadow()` where possible, otherwise using a customized slow-path handler, `handleAVX512VectorDownConvert()`. Note that shadow propagation for `pmov{s,us}` (signed/unsigned saturation) is approximated using truncation. Future work could extend `handleAVX512VectorDownConvert()` to use `GetMinMaxUnsigned()` to handle saturation precisely.
1 parent 0819fec commit cc95e40

File tree

3 files changed

+446
-535
lines changed

3 files changed

+446
-535
lines changed

llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4592,6 +4592,90 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
45924592
ConstantInt::get(IRB.getInt32Ty(), 0));
45934593
}
45944594

// Handle llvm.x86.avx512.mask.pmov{,s,us}.*.512
//
// e.g., call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512
//           (<8 x i64>, <16 x i8>, i8)
//            A          WriteThru  Mask
//
//       call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512
//           (<16 x i32>, <16 x i8>, i16)
//
// Semantics of the instruction, and of the shadow propagation below:
//   Dst[i]        = Mask[i] ? truncate_or_saturate(A[i]) : WriteThru[i]
//   Dst_shadow[i] = Mask[i] ? truncate(A_shadow[i])      : WriteThru_shadow[i]
//
// If Dst has more elements than A, the excess elements are zeroed (and the
// corresponding shadow is initialized).
//
// Note: for PMOV (truncation), handleIntrinsicByApplyingToShadow is precise
// and is much faster than this handler.
void handleAVX512VectorDownConvert(IntrinsicInst &I) {
  IRBuilder<> IRB(&I);

  // Operands, in order: source vector, write-through vector, mask scalar.
  assert(I.arg_size() == 3);
  Value *A = I.getOperand(0);
  Value *WriteThrough = I.getOperand(1);
  Value *Mask = I.getOperand(2);

  assert(isa<FixedVectorType>(A->getType()));
  assert(A->getType()->isIntOrIntVectorTy());

  assert(isa<FixedVectorType>(WriteThrough->getType()));
  assert(WriteThrough->getType()->isIntOrIntVectorTy());

  unsigned ANumElements =
      cast<FixedVectorType>(A->getType())->getNumElements();
  unsigned OutputNumElements =
      cast<FixedVectorType>(WriteThrough->getType())->getNumElements();
  // Down-converts either keep the element count (e.g., db: 16 -> 16) or
  // double it in the output vector (e.g., qb: 8 -> 16, upper half zeroed).
  assert(ANumElements == OutputNumElements ||
         ANumElements * 2 == OutputNumElements);

  // The mask is a scalar integer with one bit per element of A. Its shadow
  // is checked (not propagated): an uninitialized mask makes the lane
  // selection itself undefined.
  assert(Mask->getType()->isIntegerTy());
  assert(Mask->getType()->getScalarSizeInBits() == ANumElements);
  insertCheckShadowOf(Mask, &I);

  assert(I.getType() == WriteThrough->getType());

  // Widen the mask, if necessary, to have one bit per element of the output
  // vector.
  // We want the extra bits to have '1's, so that the CreateSelect will
  // select the values from AShadow instead of WriteThroughShadow ("maskless"
  // versions of the intrinsics are sometimes implemented using an all-1's
  // mask and an undefined value for WriteThroughShadow). We accomplish this
  // by using bitwise NOT before and after the ZExt (ZExt pads with zeros,
  // so NOT/ZExt/NOT pads with ones).
  if (ANumElements != OutputNumElements) {
    Mask = IRB.CreateNot(Mask);
    Mask = IRB.CreateZExt(Mask, Type::getIntNTy(*MS.C, OutputNumElements),
                          "_ms_widen_mask");
    Mask = IRB.CreateNot(Mask);
  }
  // Reinterpret the scalar mask as a vector of i1, one per output element,
  // so it can drive a per-lane select.
  Mask = IRB.CreateBitCast(
      Mask, FixedVectorType::get(IRB.getInt1Ty(), OutputNumElements));

  Value *AShadow = getShadow(A);

  // The return type might have more elements than the input.
  // Temporarily shrink the return type's number of elements.
  VectorType *ShadowType = maybeShrinkVectorShadowType(A, I);

  // PMOV truncates; PMOVS/PMOVUS uses signed/unsigned saturation.
  // This handler treats them all as truncation, which leads to some rare
  // false positives in the cases where the truncated bytes could
  // unambiguously saturate the value e.g., if A = ??????10 ????????
  // (big-endian), the unsigned saturated byte conversion is 11111111 i.e.,
  // fully defined, but the truncated byte is ????????.
  //
  // TODO: use GetMinMaxUnsigned() to handle saturation precisely.
  AShadow = IRB.CreateTrunc(AShadow, ShadowType, "_ms_trunc_shadow");
  // Re-widen to the output element count, filling excess lanes with zeros
  // (initialized shadow), matching the instruction's zeroing of those lanes.
  AShadow = maybeExtendVectorShadowWithZeros(AShadow, I);

  Value *WriteThroughShadow = getShadow(WriteThrough);

  // Per-lane: masked-in lanes take the (truncated) source shadow, masked-out
  // lanes keep the write-through operand's shadow.
  Value *Shadow = IRB.CreateSelect(Mask, AShadow, WriteThroughShadow);
  setShadow(&I, Shadow);
  setOriginForNaryOp(I);
}
45954679
// For sh.* compiler intrinsics:
45964680
// llvm.x86.avx512fp16.mask.{add/sub/mul/div/max/min}.sh.round
45974681
// (<8 x half>, <8 x half>, <8 x half>, i8, i32)
@@ -5412,6 +5496,66 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
54125496
break;
54135497
}
54145498

// AVX512 PMOV: Packed MOV, with truncation
// Precisely handled by applying the same intrinsic to the shadow (the
// trailing mask argument is passed through verbatim).
case Intrinsic::x86_avx512_mask_pmov_dw_512:
case Intrinsic::x86_avx512_mask_pmov_db_512:
case Intrinsic::x86_avx512_mask_pmov_qb_512:
case Intrinsic::x86_avx512_mask_pmov_qw_512: {
  // Intrinsic::x86_avx512_mask_pmov_{qd,wb}_512 were removed in
  // f608dc1f5775ee880e8ea30e2d06ab5a4a935c22
  handleIntrinsicByApplyingToShadow(I, I.getIntrinsicID(),
                                    /*trailingVerbatimArgs=*/1);
  break;
}

// AVX512 PMOV{S,US}: Packed MOV, with signed/unsigned saturation
// Approximately handled using the corresponding truncation intrinsic
// (see handleAVX512VectorDownConvert for why truncation over-approximates
// the shadow of a saturating conversion).
// TODO: improve handleAVX512VectorDownConvert to precisely model saturation
case Intrinsic::x86_avx512_mask_pmovs_dw_512:
case Intrinsic::x86_avx512_mask_pmovus_dw_512: {
  handleIntrinsicByApplyingToShadow(I,
                                    Intrinsic::x86_avx512_mask_pmov_dw_512,
                                    /*trailingVerbatimArgs=*/1);
  break;
}

case Intrinsic::x86_avx512_mask_pmovs_db_512:
case Intrinsic::x86_avx512_mask_pmovus_db_512: {
  handleIntrinsicByApplyingToShadow(I,
                                    Intrinsic::x86_avx512_mask_pmov_db_512,
                                    /*trailingVerbatimArgs=*/1);
  break;
}

case Intrinsic::x86_avx512_mask_pmovs_qb_512:
case Intrinsic::x86_avx512_mask_pmovus_qb_512: {
  handleIntrinsicByApplyingToShadow(I,
                                    Intrinsic::x86_avx512_mask_pmov_qb_512,
                                    /*trailingVerbatimArgs=*/1);
  break;
}

case Intrinsic::x86_avx512_mask_pmovs_qw_512:
case Intrinsic::x86_avx512_mask_pmovus_qw_512: {
  handleIntrinsicByApplyingToShadow(I,
                                    Intrinsic::x86_avx512_mask_pmov_qw_512,
                                    /*trailingVerbatimArgs=*/1);
  break;
}

case Intrinsic::x86_avx512_mask_pmovs_qd_512:
case Intrinsic::x86_avx512_mask_pmovus_qd_512:
case Intrinsic::x86_avx512_mask_pmovs_wb_512:
case Intrinsic::x86_avx512_mask_pmovus_wb_512: {
  // Since Intrinsic::x86_avx512_mask_pmov_{qd,wb}_512 do not exist, we
  // cannot use handleIntrinsicByApplyingToShadow. Instead, we call the
  // slow-path handler.
  handleAVX512VectorDownConvert(I);
  break;
}

// AVX512 FP16 Arithmetic
54155559
case Intrinsic::x86_avx512fp16_mask_add_sh_round:
54165560
case Intrinsic::x86_avx512fp16_mask_sub_sh_round:
54175561
case Intrinsic::x86_avx512fp16_mask_mul_sh_round:

0 commit comments

Comments
 (0)