diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 9dfd25f9a8d43..678d711eeba2b 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1366,19 +1366,23 @@ static llvm::Value *CreateCoercedLoad(Address Src, llvm::Type *Ty,
       // If we are casting a fixed i8 vector to a scalable i1 predicate
       // vector, use a vector insert and bitcast the result.
       if (ScalableDstTy->getElementType()->isIntegerTy(1) &&
-          ScalableDstTy->getElementCount().isKnownMultipleOf(8) &&
           FixedSrcTy->getElementType()->isIntegerTy(8)) {
         ScalableDstTy = llvm::ScalableVectorType::get(
             FixedSrcTy->getElementType(),
-            ScalableDstTy->getElementCount().getKnownMinValue() / 8);
+            llvm::divideCeil(
+                ScalableDstTy->getElementCount().getKnownMinValue(), 8));
       }
       if (ScalableDstTy->getElementType() == FixedSrcTy->getElementType()) {
         auto *Load = CGF.Builder.CreateLoad(Src);
         auto *PoisonVec = llvm::PoisonValue::get(ScalableDstTy);
         llvm::Value *Result = CGF.Builder.CreateInsertVector(
             ScalableDstTy, PoisonVec, Load, uint64_t(0), "cast.scalable");
-        if (ScalableDstTy != Ty)
-          Result = CGF.Builder.CreateBitCast(Result, Ty);
+        ScalableDstTy = cast<llvm::ScalableVectorType>(
+            llvm::VectorType::getWithSizeAndScalar(ScalableDstTy, Ty));
+        if (Result->getType() != ScalableDstTy)
+          Result = CGF.Builder.CreateBitCast(Result, ScalableDstTy);
+        if (Result->getType() != Ty)
+          Result = CGF.Builder.CreateExtractVector(Ty, Result, uint64_t(0));
         return Result;
       }
     }
@@ -1476,8 +1480,14 @@ CoerceScalableToFixed(CodeGenFunction &CGF, llvm::FixedVectorType *ToTy,
   // If we are casting a scalable i1 predicate vector to a fixed i8
   // vector, first bitcast the source.
   if (FromTy->getElementType()->isIntegerTy(1) &&
-      FromTy->getElementCount().isKnownMultipleOf(8) &&
       ToTy->getElementType() == CGF.Builder.getInt8Ty()) {
+    if (!FromTy->getElementCount().isKnownMultipleOf(8)) {
+      FromTy = llvm::ScalableVectorType::get(
+          FromTy->getElementType(),
+          llvm::alignTo<8>(FromTy->getElementCount().getKnownMinValue()));
+      llvm::Value *ZeroVec = llvm::Constant::getNullValue(FromTy);
+      V = CGF.Builder.CreateInsertVector(FromTy, ZeroVec, V, uint64_t(0));
+    }
     FromTy = llvm::ScalableVectorType::get(
         ToTy->getElementType(),
         FromTy->getElementCount().getKnownMinValue() / 8);
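The two CGCall.cpp hunks above handle predicate vectors whose minimum element count is not a multiple of 8, e.g. vbool64_t at VLEN=64, i.e. `<vscale x 1 x i1>`. A minimal standalone sketch of the arithmetic change, assuming only llvm/Support/MathExtras.h (illustrative code, not part of the patch):

```cpp
#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  unsigned MinElts = 1; // <vscale x 1 x i1>, the vbool64_t predicate
  // Old load path: truncating division yields an invalid zero-element vector.
  assert(MinElts / 8 == 0);
  // New load path: round up, so one i8 element carries the predicate bits.
  assert(llvm::divideCeil(MinElts, 8) == 1);
  // New store path: pad the predicate with zeroes to the next multiple of
  // 8 lanes before reinterpreting it as i8 elements.
  assert(llvm::alignTo<8>(MinElts) == 8);
  return 0;
}
```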
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index f639a87e3ad0b..d080844447f14 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2492,18 +2492,22 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
       // If we are casting a fixed i8 vector to a scalable i1 predicate
       // vector, use a vector insert and bitcast the result.
       if (ScalableDstTy->getElementType()->isIntegerTy(1) &&
-          ScalableDstTy->getElementCount().isKnownMultipleOf(8) &&
           FixedSrcTy->getElementType()->isIntegerTy(8)) {
         ScalableDstTy = llvm::ScalableVectorType::get(
             FixedSrcTy->getElementType(),
-            ScalableDstTy->getElementCount().getKnownMinValue() / 8);
+            llvm::divideCeil(
+                ScalableDstTy->getElementCount().getKnownMinValue(), 8));
       }
       if (FixedSrcTy->getElementType() == ScalableDstTy->getElementType()) {
         llvm::Value *PoisonVec = llvm::PoisonValue::get(ScalableDstTy);
         llvm::Value *Result = Builder.CreateInsertVector(
             ScalableDstTy, PoisonVec, Src, uint64_t(0), "cast.scalable");
+        ScalableDstTy = cast<llvm::ScalableVectorType>(
+            llvm::VectorType::getWithSizeAndScalar(ScalableDstTy, DstTy));
+        if (Result->getType() != ScalableDstTy)
+          Result = Builder.CreateBitCast(Result, ScalableDstTy);
         if (Result->getType() != DstTy)
-          Result = Builder.CreateBitCast(Result, DstTy);
+          Result = Builder.CreateExtractVector(DstTy, Result, uint64_t(0));
         return Result;
       }
     }
@@ -2517,8 +2521,17 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
     // If we are casting a scalable i1 predicate vector to a fixed i8
    // vector, bitcast the source and use a vector extract.
    if (ScalableSrcTy->getElementType()->isIntegerTy(1) &&
-        ScalableSrcTy->getElementCount().isKnownMultipleOf(8) &&
        FixedDstTy->getElementType()->isIntegerTy(8)) {
+      if (!ScalableSrcTy->getElementCount().isKnownMultipleOf(8)) {
+        ScalableSrcTy = llvm::ScalableVectorType::get(
+            ScalableSrcTy->getElementType(),
+            llvm::alignTo<8>(
+                ScalableSrcTy->getElementCount().getKnownMinValue()));
+        llvm::Value *ZeroVec = llvm::Constant::getNullValue(ScalableSrcTy);
+        Src = Builder.CreateInsertVector(ScalableSrcTy, ZeroVec, Src,
+                                         uint64_t(0));
+      }
+
      ScalableSrcTy = llvm::ScalableVectorType::get(
          FixedDstTy->getElementType(),
          ScalableSrcTy->getElementCount().getKnownMinValue() / 8);
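Both cast directions in CGExprScalar.cpp now route through a byte-sized scalable container instead of a round-trip through memory. A simplified IRBuilder sketch of the widening direction for vbool32_t follows; the helper name and setup are hypothetical, and the patch emits this sequence inline rather than through such a helper:

```cpp
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"

// Hypothetical helper: widen a <1 x i8> fixed mask to <vscale x 2 x i1>.
llvm::Value *widenFixedMaskToBool32(llvm::IRBuilder<> &B, llvm::Value *Src) {
  auto *ByteVecTy = llvm::ScalableVectorType::get(B.getInt8Ty(), 1);
  auto *WideMaskTy = llvm::ScalableVectorType::get(B.getInt1Ty(), 8);
  auto *MaskTy = llvm::ScalableVectorType::get(B.getInt1Ty(), 2);
  // Insert the fixed <1 x i8> into the low lane of <vscale x 1 x i8>.
  llvm::Value *V = B.CreateInsertVector(
      ByteVecTy, llvm::PoisonValue::get(ByteVecTy), Src, uint64_t(0),
      "cast.scalable");
  // Reinterpret the same bits as <vscale x 8 x i1>.
  V = B.CreateBitCast(V, WideMaskTy);
  // Extract the low <vscale x 2 x i1> slice, which is the vbool32_t value.
  return B.CreateExtractVector(MaskTy, V, uint64_t(0));
}
```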
diff --git a/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-call.c b/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-call.c
index e2f02dc64f766..3ab065d34bcfb 100644
--- a/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-call.c
+++ b/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-call.c
@@ -15,24 +15,12 @@ typedef vbool64_t fixed_bool64_t __attribute__((riscv_rvv_vector_bits(__riscv_v_
 
 // CHECK-64-LABEL: @call_bool32_ff(
 // CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    [[SAVED_VALUE4:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-64-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-64-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[OP1_COERCE:%.*]], <vscale x 2 x i1> [[OP2_COERCE:%.*]], i64 2)
-// CHECK-64-NEXT:    store <vscale x 2 x i1> [[TMP0]], ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA6:![0-9]+]]
-// CHECK-64-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA10:![0-9]+]]
-// CHECK-64-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    [[TMP2:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
+// CHECK-64-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[TMP0:%.*]], <vscale x 2 x i1> [[TMP1:%.*]], i64 2)
 // CHECK-64-NEXT:    ret <vscale x 2 x i1> [[TMP2]]
 //
 // CHECK-128-LABEL: @call_bool32_ff(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE4:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-128-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-128-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[OP1_COERCE:%.*]], <vscale x 2 x i1> [[OP2_COERCE:%.*]], i64 4)
-// CHECK-128-NEXT:    store <vscale x 2 x i1> [[TMP0]], ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA6:![0-9]+]]
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA10:![0-9]+]]
-// CHECK-128-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    [[TMP2:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
+// CHECK-128-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[TMP0:%.*]], <vscale x 2 x i1> [[TMP1:%.*]], i64 4)
 // CHECK-128-NEXT:    ret <vscale x 2 x i1> [[TMP2]]
 //
 fixed_bool32_t call_bool32_ff(fixed_bool32_t op1, fixed_bool32_t op2) {
@@ -41,24 +29,12 @@ fixed_bool32_t call_bool32_ff(fixed_bool32_t op1, fixed_bool32_t op2) {
 
 // CHECK-64-LABEL: @call_bool64_ff(
 // CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    [[SAVED_VALUE4:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-64-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-64-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1.i64(<vscale x 1 x i1> [[OP1_COERCE:%.*]], <vscale x 1 x i1> [[OP2_COERCE:%.*]], i64 1)
-// CHECK-64-NEXT:    store <vscale x 1 x i1> [[TMP0]], ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA11:![0-9]+]]
-// CHECK-64-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA10]]
-// CHECK-64-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    [[TMP2:%.*]] = load <vscale x 1 x i1>, ptr [[RETVAL_COERCE]], align 1
+// CHECK-64-NEXT:    [[TMP2:%.*]] = tail call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1.i64(<vscale x 1 x i1> [[TMP0:%.*]], <vscale x 1 x i1> [[TMP1:%.*]], i64 1)
 // CHECK-64-NEXT:    ret <vscale x 1 x i1> [[TMP2]]
 //
 // CHECK-128-LABEL: @call_bool64_ff(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE4:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-128-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-128-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1.i64(<vscale x 1 x i1> [[OP1_COERCE:%.*]], <vscale x 1 x i1> [[OP2_COERCE:%.*]], i64 2)
-// CHECK-128-NEXT:    store <vscale x 1 x i1> [[TMP0]], ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA11:![0-9]+]]
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE4]], align 1, !tbaa [[TBAA10]]
-// CHECK-128-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    [[TMP2:%.*]] = load <vscale x 1 x i1>, ptr [[RETVAL_COERCE]], align 1
+// CHECK-128-NEXT:    [[TMP2:%.*]] = tail call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1.i64(<vscale x 1 x i1> [[TMP0:%.*]], <vscale x 1 x i1> [[TMP1:%.*]], i64 2)
 // CHECK-128-NEXT:    ret <vscale x 1 x i1> [[TMP2]]
 //
 fixed_bool64_t call_bool64_ff(fixed_bool64_t op1, fixed_bool64_t op2) {
@@ -71,25 +47,13 @@ fixed_bool64_t call_bool64_ff(fixed_bool64_t op1, fixed_bool64_t op2) {
 
 // CHECK-64-LABEL: @call_bool32_fs(
 // CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    [[SAVED_VALUE2:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-64-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-64-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[OP1_COERCE:%.*]], <vscale x 2 x i1> [[OP2:%.*]], i64 2)
-// CHECK-64-NEXT:    store <vscale x 2 x i1> [[TMP0]], ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA6]]
-// CHECK-64-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA10]]
-// CHECK-64-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    [[TMP2:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    ret <vscale x 2 x i1> [[TMP2]]
+// CHECK-64-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[TMP0:%.*]], <vscale x 2 x i1> [[OP2:%.*]], i64 2)
+// CHECK-64-NEXT:    ret <vscale x 2 x i1> [[TMP1]]
 //
 // CHECK-128-LABEL: @call_bool32_fs(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE2:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-128-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-128-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[OP1_COERCE:%.*]], <vscale x 2 x i1> [[OP2:%.*]], i64 4)
-// CHECK-128-NEXT:    store <vscale x 2 x i1> [[TMP0]], ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA6]]
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA10]]
-// CHECK-128-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    [[TMP2:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    ret <vscale x 2 x i1> [[TMP2]]
+// CHECK-128-NEXT:    [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[TMP0:%.*]], <vscale x 2 x i1> [[OP2:%.*]], i64 4)
+// CHECK-128-NEXT:    ret <vscale x 2 x i1> [[TMP1]]
 //
 fixed_bool32_t call_bool32_fs(fixed_bool32_t op1, vbool32_t op2) {
   return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen / 32);
@@ -97,25 +61,13 @@ fixed_bool32_t call_bool32_fs(fixed_bool32_t op1, vbool32_t op2) {
 
 // CHECK-64-LABEL: @call_bool64_fs(
 // CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    [[SAVED_VALUE2:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-64-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-64-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1.i64(<vscale x 1 x i1> [[OP1_COERCE:%.*]], <vscale x 1 x i1> [[OP2:%.*]], i64 1)
-// CHECK-64-NEXT:    store <vscale x 1 x i1> [[TMP0]], ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA11]]
-// CHECK-64-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA10]]
-// CHECK-64-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    [[TMP2:%.*]] = load <vscale x 1 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    ret <vscale x 1 x i1> [[TMP2]]
+// CHECK-64-NEXT:    [[TMP1:%.*]] = tail call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1.i64(<vscale x 1 x i1> [[TMP0:%.*]], <vscale x 1 x i1> [[OP2:%.*]], i64 1)
+// CHECK-64-NEXT:    ret <vscale x 1 x i1> [[TMP1]]
 //
 // CHECK-128-LABEL: @call_bool64_fs(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE2:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-128-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-128-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1.i64(<vscale x 1 x i1> [[OP1_COERCE:%.*]], <vscale x 1 x i1> [[OP2:%.*]], i64 2)
-// CHECK-128-NEXT:    store <vscale x 1 x i1> [[TMP0]], ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA11]]
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE2]], align 1, !tbaa [[TBAA10]]
-// CHECK-128-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    [[TMP2:%.*]] = load <vscale x 1 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    ret <vscale x 1 x i1> [[TMP2]]
+// CHECK-128-NEXT:    [[TMP1:%.*]] = tail call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1.i64(<vscale x 1 x i1> [[TMP0:%.*]], <vscale x 1 x i1> [[OP2:%.*]], i64 2)
+// CHECK-128-NEXT:    ret <vscale x 1 x i1> [[TMP1]]
 //
 fixed_bool64_t call_bool64_fs(fixed_bool64_t op1, vbool64_t op2) {
   return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen / 64);
@@ -127,25 +79,13 @@ fixed_bool64_t call_bool64_fs(fixed_bool64_t op1, vbool64_t op2) {
 
 // CHECK-64-LABEL: @call_bool32_ss(
 // CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-64-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
 // CHECK-64-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[OP1:%.*]], <vscale x 2 x i1> [[OP2:%.*]], i64 2)
-// CHECK-64-NEXT:    store <vscale x 2 x i1> [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
-// CHECK-64-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]]
-// CHECK-64-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    [[TMP2:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    ret <vscale x 2 x i1> [[TMP2]]
+// CHECK-64-NEXT:    ret <vscale x 2 x i1> [[TMP0]]
 //
 // CHECK-128-LABEL: @call_bool32_ss(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-128-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
 // CHECK-128-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[OP1:%.*]], <vscale x 2 x i1> [[OP2:%.*]], i64 4)
-// CHECK-128-NEXT:    store <vscale x 2 x i1> [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]]
-// CHECK-128-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    [[TMP2:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    ret <vscale x 2 x i1> [[TMP2]]
+// CHECK-128-NEXT:    ret <vscale x 2 x i1> [[TMP0]]
 //
 fixed_bool32_t call_bool32_ss(vbool32_t op1, vbool32_t op2) {
   return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen / 32);
@@ -153,25 +93,13 @@ fixed_bool32_t call_bool32_ss(vbool32_t op1, vbool32_t op2) {
 
 // CHECK-64-LABEL: @call_bool64_ss(
 // CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-64-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 1 x i1>, align 1
 // CHECK-64-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1.i64(<vscale x 1 x i1> [[OP1:%.*]], <vscale x 1 x i1> [[OP2:%.*]], i64 1)
-// CHECK-64-NEXT:    store <vscale x 1 x i1> [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA11]]
-// CHECK-64-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]]
-// CHECK-64-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    [[TMP2:%.*]] = load <vscale x 1 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    ret <vscale x 1 x i1> [[TMP2]]
+// CHECK-64-NEXT:    ret <vscale x 1 x i1> [[TMP0]]
 //
 // CHECK-128-LABEL: @call_bool64_ss(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-128-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 1 x i1>, align 1
 // CHECK-128-NEXT:    [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.riscv.vmand.nxv1i1.i64(<vscale x 1 x i1> [[OP1:%.*]], <vscale x 1 x i1> [[OP2:%.*]], i64 2)
-// CHECK-128-NEXT:    store <vscale x 1 x i1> [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA11]]
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]]
-// CHECK-128-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    [[TMP2:%.*]] = load <vscale x 1 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    ret <vscale x 1 x i1> [[TMP2]]
+// CHECK-128-NEXT:    ret <vscale x 1 x i1> [[TMP0]]
 //
 fixed_bool64_t call_bool64_ss(vbool64_t op1, vbool64_t op2) {
   return __riscv_vmand(op1, op2, __riscv_v_fixed_vlen / 64);
diff --git a/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-cast.c b/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-cast.c
index f0fa7e8d07b4d..8407c065adb21 100644
--- a/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-cast.c
+++ b/clang/test/CodeGen/RISCV/attr-riscv-rvv-vector-bits-less-8-cast.c
@@ -29,46 +29,22 @@ fixed_bool8_t from_vbool8_t(vbool8_t type) {
 // CHECK-64-LABEL: @from_vbool16_t(
 // CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 4 x i1>, align 1
-// CHECK-64-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i1>, align 1
-// CHECK-64-NEXT:    store <vscale x 4 x i1> [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6:![0-9]+]]
-// CHECK-64-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10:![0-9]+]]
-// CHECK-64-NEXT:    store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    ret <vscale x 4 x i1> [[TMP1]]
+// CHECK-64-NEXT:    ret <vscale x 4 x i1> [[TYPE:%.*]]
 //
 // CHECK-128-LABEL: @from_vbool16_t(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 4 x i1>, align 1
-// CHECK-128-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i1>, align 1
-// CHECK-128-NEXT:    store <vscale x 4 x i1> [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6:![0-9]+]]
-// CHECK-128-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10:![0-9]+]]
-// CHECK-128-NEXT:    store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    ret <vscale x 4 x i1> [[TMP1]]
+// CHECK-128-NEXT:    ret <vscale x 4 x i1> [[TYPE:%.*]]
 //
 fixed_bool16_t from_vbool16_t(vbool16_t type) {
   return type;
 }
 
 // CHECK-64-LABEL: @from_vbool32_t(
 // CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-64-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-64-NEXT:    store <vscale x 2 x i1> [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA11:![0-9]+]]
-// CHECK-64-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]]
-// CHECK-64-NEXT:    store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    ret <vscale x 2 x i1> [[TMP1]]
+// CHECK-64-NEXT:    ret <vscale x 2 x i1> [[TYPE:%.*]]
 //
 // CHECK-128-LABEL: @from_vbool32_t(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-128-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-128-NEXT:    store <vscale x 2 x i1> [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA11:![0-9]+]]
-// CHECK-128-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]]
-// CHECK-128-NEXT:    store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    ret <vscale x 2 x i1> [[TMP1]]
+// CHECK-128-NEXT:    ret <vscale x 2 x i1> [[TYPE:%.*]]
 //
 fixed_bool32_t from_vbool32_t(vbool32_t type) {
   return type;
@@ -76,11 +52,11 @@ fixed_bool32_t from_vbool32_t(vbool32_t type) {
 
 // CHECK-64-LABEL: @to_vbool32_t(
 // CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    ret <vscale x 2 x i1> [[TYPE_COERCE:%.*]]
+// CHECK-64-NEXT:    ret <vscale x 2 x i1> [[TMP0:%.*]]
 //
 // CHECK-128-LABEL: @to_vbool32_t(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    ret <vscale x 2 x i1> [[TYPE_COERCE:%.*]]
+// CHECK-128-NEXT:    ret <vscale x 2 x i1> [[TMP0:%.*]]
 //
 vbool32_t to_vbool32_t(fixed_bool32_t type) {
   return type;
@@ -88,23 +64,11 @@ vbool32_t to_vbool32_t(fixed_bool32_t type) {
 
 // CHECK-64-LABEL: @from_vbool64_t(
 // CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-64-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-64-NEXT:    store <vscale x 1 x i1> [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA13:![0-9]+]]
-// CHECK-64-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]]
-// CHECK-64-NEXT:    store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    [[TMP1:%.*]] = load <vscale x 1 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-64-NEXT:    ret <vscale x 1 x i1> [[TMP1]]
+// CHECK-64-NEXT:    ret <vscale x 1 x i1> [[TYPE:%.*]]
 //
 // CHECK-128-LABEL: @from_vbool64_t(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-128-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-128-NEXT:    store <vscale x 1 x i1> [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA13:![0-9]+]]
-// CHECK-128-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA10]]
-// CHECK-128-NEXT:    store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <vscale x 1 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-128-NEXT:    ret <vscale x 1 x i1> [[TMP1]]
+// CHECK-128-NEXT:    ret <vscale x 1 x i1> [[TYPE:%.*]]
 //
 fixed_bool64_t from_vbool64_t(vbool64_t type) {
   return type;
@@ -112,11 +76,11 @@ fixed_bool64_t from_vbool64_t(vbool64_t type) {
 
 // CHECK-64-LABEL: @to_vbool64_t(
 // CHECK-64-NEXT:  entry:
-// CHECK-64-NEXT:    ret <vscale x 1 x i1> [[TYPE_COERCE:%.*]]
+// CHECK-64-NEXT:    ret <vscale x 1 x i1> [[TMP0:%.*]]
 //
 // CHECK-128-LABEL: @to_vbool64_t(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    ret <vscale x 1 x i1> [[TYPE_COERCE:%.*]]
+// CHECK-128-NEXT:    ret <vscale x 1 x i1> [[TMP0:%.*]]
 //
 vbool64_t to_vbool64_t(fixed_bool64_t type) {
   return type;
diff --git a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast-less-8.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast-less-8.c
index 058ec49b77881..45a099dc9c678 100644
--- a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast-less-8.c
+++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast-less-8.c
@@ -55,12 +55,12 @@ DEFINE_STRUCT(bool64)
 
 // CHECK-128-LABEL: @read_bool32(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1
 // CHECK-128-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 1
 // CHECK-128-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[Y]], align 1, !tbaa [[TBAA6:![0-9]+]]
-// CHECK-128-NEXT:    store <1 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x i1>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
-// CHECK-128-NEXT:    ret <vscale x 2 x i1> [[TMP1]]
+// CHECK-128-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 1 x i8> @llvm.vector.insert.nxv1i8.v1i8(<vscale x 1 x i8> poison, <1 x i8> [[TMP0]], i64 0)
+// CHECK-128-NEXT:    [[TMP1:%.*]] = bitcast <vscale x 1 x i8> [[CAST_SCALABLE]] to <vscale x 8 x i1>
+// CHECK-128-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1(<vscale x 8 x i1> [[TMP1]], i64 0)
+// CHECK-128-NEXT:    ret <vscale x 2 x i1> [[TMP2]]
 //
 vbool32_t read_bool32(struct struct_bool32 *s) {
   return s->y[0];
@@ -68,11 +68,11 @@ vbool32_t read_bool32(struct struct_bool32 *s) {
 
 // CHECK-128-LABEL: @write_bool32(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-128-NEXT:    store <vscale x 2 x i1> [[X:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA9:![0-9]+]]
-// CHECK-128-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
+// CHECK-128-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv2i1(<vscale x 8 x i1> zeroinitializer, <vscale x 2 x i1> [[X:%.*]], i64 0)
+// CHECK-128-NEXT:    [[TMP1:%.*]] = bitcast <vscale x 8 x i1> [[TMP0]] to <vscale x 1 x i8>
+// CHECK-128-NEXT:    [[CAST_FIXED:%.*]] = tail call <1 x i8> @llvm.vector.extract.v1i8.nxv1i8(<vscale x 1 x i8> [[TMP1]], i64 0)
 // CHECK-128-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 1
-// CHECK-128-NEXT:    store <1 x i8> [[TMP0]], ptr [[Y]], align 1, !tbaa [[TBAA6]]
+// CHECK-128-NEXT:    store <1 x i8> [[CAST_FIXED]], ptr [[Y]], align 1, !tbaa [[TBAA6]]
 // CHECK-128-NEXT:    ret void
 //
 void write_bool32(struct struct_bool32 *s, vbool32_t x) {
@@ -81,12 +81,12 @@ void write_bool32(struct struct_bool32 *s, vbool32_t x) {
 
 // CHECK-128-LABEL: @read_bool64(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1
 // CHECK-128-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 1
 // CHECK-128-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[Y]], align 1, !tbaa [[TBAA6]]
-// CHECK-128-NEXT:    store <1 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
-// CHECK-128-NEXT:    [[TMP1:%.*]] = load <vscale x 1 x i1>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
-// CHECK-128-NEXT:    ret <vscale x 1 x i1> [[TMP1]]
+// CHECK-128-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 1 x i8> @llvm.vector.insert.nxv1i8.v1i8(<vscale x 1 x i8> poison, <1 x i8> [[TMP0]], i64 0)
+// CHECK-128-NEXT:    [[TMP1:%.*]] = bitcast <vscale x 1 x i8> [[CAST_SCALABLE]] to <vscale x 8 x i1>
+// CHECK-128-NEXT:    [[TMP2:%.*]] = tail call <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv8i1(<vscale x 8 x i1> [[TMP1]], i64 0)
+// CHECK-128-NEXT:    ret <vscale x 1 x i1> [[TMP2]]
 //
 vbool64_t read_bool64(struct struct_bool64 *s) {
   return s->y[0];
@@ -94,11 +94,11 @@ vbool64_t read_bool64(struct struct_bool64 *s) {
 
 // CHECK-128-LABEL: @write_bool64(
 // CHECK-128-NEXT:  entry:
-// CHECK-128-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 1 x i1>, align 1
-// CHECK-128-NEXT:    store <vscale x 1 x i1> [[X:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA11:![0-9]+]]
-// CHECK-128-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
+// CHECK-128-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1> zeroinitializer, <vscale x 1 x i1> [[X:%.*]], i64 0)
+// CHECK-128-NEXT:    [[TMP1:%.*]] = bitcast <vscale x 8 x i1> [[TMP0]] to <vscale x 1 x i8>
+// CHECK-128-NEXT:    [[CAST_FIXED:%.*]] = tail call <1 x i8> @llvm.vector.extract.v1i8.nxv1i8(<vscale x 1 x i8> [[TMP1]], i64 0)
 // CHECK-128-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S:%.*]], i64 1
-// CHECK-128-NEXT:    store <1 x i8> [[TMP0]], ptr [[Y]], align 1, !tbaa [[TBAA6]]
+// CHECK-128-NEXT:    store <1 x i8> [[CAST_FIXED]], ptr [[Y]], align 1, !tbaa [[TBAA6]]
 // CHECK-128-NEXT:    ret void
 //
 void write_bool64(struct struct_bool64 *s, vbool64_t x) {
diff --git a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c
index 7992951346d54..0a50e41dda7e1 100644
--- a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c
+++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c
@@ -97,13 +97,7 @@ vbool4_t to_vbool4_t(fixed_bool4_t type) {
 
 // CHECK-LABEL: @from_vbool32_t(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-NEXT:    store <vscale x 2 x i1> [[TYPE:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA4:![0-9]+]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA8:![0-9]+]]
-// CHECK-NEXT:    store <1 x i8> [[TMP0]], ptr [[RETVAL_COERCE]], align 1
-// CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-NEXT:    ret <vscale x 2 x i1> [[TMP1]]
+// CHECK-NEXT:    ret <vscale x 2 x i1> [[TYPE:%.*]]
 //
 fixed_bool32_t from_vbool32_t(vbool32_t type) {
   return type;
@@ -111,7 +105,7 @@ fixed_bool32_t from_vbool32_t(vbool32_t type) {
 
 // CHECK-LABEL: @to_vbool32_t(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret <vscale x 2 x i1> [[TYPE_COERCE:%.*]]
+// CHECK-NEXT:    ret <vscale x 2 x i1> [[TMP0:%.*]]
 //
 vbool32_t to_vbool32_t(fixed_bool32_t type) {
   return type;
@@ -119,7 +113,7 @@ vbool32_t to_vbool32_t(fixed_bool32_t type) {
 
 // CHECK-LABEL: @to_vint32m1_t__from_gnu_int32m1_t(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA6:![0-9]+]]
 // CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v8i32(<vscale x 2 x i32> poison, <8 x i32> [[TYPE]], i64 0)
 // CHECK-NEXT:    ret <vscale x 2 x i32> [[CAST_SCALABLE]]
 //
@@ -130,7 +124,7 @@ vint32m1_t to_vint32m1_t__from_gnu_int32m1_t(gnu_int32m1_t type) {
 
 // CHECK-LABEL: @from_vint32m1_t__to_gnu_int32m1_t(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[CAST_FIXED:%.*]] = tail call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32(<vscale x 2 x i32> [[TYPE:%.*]], i64 0)
-// CHECK-NEXT:    store <8 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA8]]
+// CHECK-NEXT:    store <8 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA6]]
 // CHECK-NEXT:    ret void
 //
 gnu_int32m1_t from_vint32m1_t__to_gnu_int32m1_t(vint32m1_t type) {
@@ -139,7 +133,7 @@ gnu_int32m1_t from_vint32m1_t__to_gnu_int32m1_t(vint32m1_t type) {
 
 // CHECK-LABEL: @to_fixed_int32m1_t__from_gnu_int32m1_t(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA6]]
 // CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v8i32(<vscale x 2 x i32> poison, <8 x i32> [[TYPE]], i64 0)
 // CHECK-NEXT:    ret <vscale x 2 x i32> [[CAST_SCALABLE]]
 //
@@ -150,7 +144,7 @@ fixed_int32m1_t to_fixed_int32m1_t__from_gnu_int32m1_t(gnu_int32m1_t type) {
 
 // CHECK-LABEL: @from_fixed_int32m1_t__to_gnu_int32m1_t(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TYPE:%.*]] = tail call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32(<vscale x 2 x i32> [[TYPE_COERCE:%.*]], i64 0)
-// CHECK-NEXT:    store <8 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA8]]
+// CHECK-NEXT:    store <8 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA6]]
 // CHECK-NEXT:    ret void
 //
 gnu_int32m1_t from_fixed_int32m1_t__to_gnu_int32m1_t(fixed_int32m1_t type) {
diff --git a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-codegen.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-codegen.c
index d81855aea2e5e..f01e6caeefd43 100644
--- a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-codegen.c
+++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-codegen.c
@@ -113,25 +113,25 @@ fixed_int16m4_t test_bool4(vbool4_t m, vint16m4_t vec) {
 // CHECK-NEXT:    [[M_ADDR:%.*]] = alloca <vscale x 2 x i1>, align 1
 // CHECK-NEXT:    [[VEC_ADDR:%.*]] = alloca <vscale x 2 x i32>, align 4
 // CHECK-NEXT:    [[MASK:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-NEXT:    [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1
 // CHECK-NEXT:    store <vscale x 2 x i1> [[M:%.*]], ptr [[M_ADDR]], align 1
 // CHECK-NEXT:    store <vscale x 2 x i32> [[VEC:%.*]], ptr [[VEC_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 2 x i1>, ptr [[M_ADDR]], align 1
 // CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr @global_bool32, align 1
-// CHECK-NEXT:    store <1 x i8> [[TMP1]], ptr [[SAVED_VALUE]], align 1
-// CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 2 x i1>, ptr [[SAVED_VALUE]], align 1
-// CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP2]], i64 8)
-// CHECK-NEXT:    store <vscale x 2 x i1> [[TMP3]], ptr [[MASK]], align 1
-// CHECK-NEXT:    [[TMP4:%.*]] = load <vscale x 2 x i1>, ptr [[MASK]], align 1
-// CHECK-NEXT:    [[TMP5:%.*]] = load <vscale x 2 x i32>, ptr [[VEC_ADDR]], align 4
-// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr @global_vec, align 8
-// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v8i32(<vscale x 2 x i32> poison, <8 x i32> [[TMP6]], i64 0)
-// CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i32> @llvm.riscv.vadd.mask.nxv2i32.nxv2i32.i64(<vscale x 2 x i32> poison, <vscale x 2 x i32> [[TMP5]], <vscale x 2 x i32> [[CAST_SCALABLE]], <vscale x 2 x i1> [[TMP4]], i64 8, i64 3)
-// CHECK-NEXT:    [[CAST_FIXED:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32(<vscale x 2 x i32> [[TMP7]], i64 0)
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 1 x i8> @llvm.vector.insert.nxv1i8.v1i8(<vscale x 1 x i8> poison, <1 x i8> [[TMP1]], i64 0)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 1 x i8> [[CAST_SCALABLE]] to <vscale x 8 x i1>
+// CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1(<vscale x 8 x i1> [[TMP2]], i64 0)
+// CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 2 x i1> @llvm.riscv.vmand.nxv2i1.i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i1> [[TMP3]], i64 8)
+// CHECK-NEXT:    store <vscale x 2 x i1> [[TMP4]], ptr [[MASK]], align 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <vscale x 2 x i1>, ptr [[MASK]], align 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <vscale x 2 x i32>, ptr [[VEC_ADDR]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i32>, ptr @global_vec, align 8
+// CHECK-NEXT:    [[CAST_SCALABLE1:%.*]] = call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v8i32(<vscale x 2 x i32> poison, <8 x i32> [[TMP7]], i64 0)
+// CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x i32> @llvm.riscv.vadd.mask.nxv2i32.nxv2i32.i64(<vscale x 2 x i32> poison, <vscale x 2 x i32> [[TMP6]], <vscale x 2 x i32> [[CAST_SCALABLE1]], <vscale x 2 x i1> [[TMP5]], i64 8, i64 3)
+// CHECK-NEXT:    [[CAST_FIXED:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32(<vscale x 2 x i32> [[TMP8]], i64 0)
 // CHECK-NEXT:    store <8 x i32> [[CAST_FIXED]], ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr [[RETVAL]], align 8
-// CHECK-NEXT:    [[CAST_SCALABLE1:%.*]] = call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v8i32(<vscale x 2 x i32> poison, <8 x i32> [[TMP8]], i64 0)
-// CHECK-NEXT:    ret <vscale x 2 x i32> [[CAST_SCALABLE1]]
+// CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr [[RETVAL]], align 8
+// CHECK-NEXT:    [[CAST_SCALABLE2:%.*]] = call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v8i32(<vscale x 2 x i32> poison, <8 x i32> [[TMP9]], i64 0)
+// CHECK-NEXT:    ret <vscale x 2 x i32> [[CAST_SCALABLE2]]
 //
 fixed_int32m1_t test_bool32(vbool32_t m, vint32m1_t vec) {
   vbool32_t mask = __riscv_vmand(m, global_bool32, __riscv_v_fixed_vlen/32);
@@ -224,15 +224,16 @@ fixed_bool4_t address_of_array_idx_bool4() {
 // CHECK-NEXT:    [[RETVAL:%.*]] = alloca <1 x i8>, align 1
 // CHECK-NEXT:    [[ARR:%.*]] = alloca [3 x <1 x i8>], align 1
 // CHECK-NEXT:    [[PARR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 2 x i1>, align 1
 // CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i8>], ptr [[ARR]], i64 0, i64 0
 // CHECK-NEXT:    store ptr [[ARRAYIDX]], ptr [[PARR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PARR]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i8>, ptr [[TMP0]], align 1
 // CHECK-NEXT:    store <1 x i8> [[TMP1]], ptr [[RETVAL]], align 1
-// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[RETVAL_COERCE]], ptr align 1 [[RETVAL]], i64 1, i1 false)
-// CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 2 x i1>, ptr [[RETVAL_COERCE]], align 1
-// CHECK-NEXT:    ret <vscale x 2 x i1> [[TMP2]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load <1 x i8>, ptr [[RETVAL]], align 1
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 1 x i8> @llvm.vector.insert.nxv1i8.v1i8(<vscale x 1 x i8> poison, <1 x i8> [[TMP2]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <vscale x 1 x i8> [[CAST_SCALABLE]] to <vscale x 8 x i1>
+// CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1(<vscale x 8 x i1> [[TMP3]], i64 0)
+// CHECK-NEXT:    ret <vscale x 2 x i1> [[TMP4]]
 //
 fixed_bool32_t address_of_array_idx_bool32() {
   fixed_bool32_t arr[3];
diff --git a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-globals.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-globals.c
index 4bd6311e05b03..92ba27fb65425 100644
--- a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-globals.c
+++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-globals.c
@@ -89,10 +89,10 @@ void write_global_bool4(vbool4_t v) { global_bool4 = v; }
 #if __riscv_v_fixed_vlen >= 256
 // CHECK-256-LABEL: @write_global_bool32(
 // CHECK-256-NEXT:  entry:
-// CHECK-256-NEXT:    [[SAVED_VALUE:%.*]] = alloca <vscale x 2 x i1>, align 1
-// CHECK-256-NEXT:    store <vscale x 2 x i1> [[V:%.*]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA9:![0-9]+]]
-// CHECK-256-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
-// CHECK-256-NEXT:    store <1 x i8> [[TMP0]], ptr @global_bool32, align 1, !tbaa [[TBAA6]]
+// CHECK-256-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv2i1(<vscale x 8 x i1> zeroinitializer, <vscale x 2 x i1> [[V:%.*]], i64 0)
+// CHECK-256-NEXT:    [[TMP1:%.*]] = bitcast <vscale x 8 x i1> [[TMP0]] to <vscale x 1 x i8>
+// CHECK-256-NEXT:    [[CAST_FIXED:%.*]] = tail call <1 x i8> @llvm.vector.extract.v1i8.nxv1i8(<vscale x 1 x i8> [[TMP1]], i64 0)
+// CHECK-256-NEXT:    store <1 x i8> [[CAST_FIXED]], ptr @global_bool32, align 1, !tbaa [[TBAA6]]
 // CHECK-256-NEXT:    ret void
 //
 void write_global_bool32(vbool32_t v) { global_bool32 = v; }
@@ -151,11 +151,11 @@ vbool4_t read_global_bool4() { return global_bool4; }
 #if __riscv_v_fixed_vlen >= 256
 // CHECK-256-LABEL: @read_global_bool32(
 // CHECK-256-NEXT:  entry:
-// CHECK-256-NEXT:    [[SAVED_VALUE:%.*]] = alloca <1 x i8>, align 1
 // CHECK-256-NEXT:    [[TMP0:%.*]] = load <1 x i8>, ptr @global_bool32, align 1, !tbaa [[TBAA6]]
-// CHECK-256-NEXT:    store <1 x i8> [[TMP0]], ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
-// CHECK-256-NEXT:    [[TMP1:%.*]] = load <vscale x 2 x i1>, ptr [[SAVED_VALUE]], align 1, !tbaa [[TBAA6]]
-// CHECK-256-NEXT:    ret <vscale x 2 x i1> [[TMP1]]
+// CHECK-256-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 1 x i8> @llvm.vector.insert.nxv1i8.v1i8(<vscale x 1 x i8> poison, <1 x i8> [[TMP0]], i64 0)
+// CHECK-256-NEXT:    [[TMP1:%.*]] = bitcast <vscale x 1 x i8> [[CAST_SCALABLE]] to <vscale x 8 x i1>
+// CHECK-256-NEXT:    [[TMP2:%.*]] = tail call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1(<vscale x 8 x i1> [[TMP1]], i64 0)
+// CHECK-256-NEXT:    ret <vscale x 2 x i1> [[TMP2]]
 //
 vbool32_t read_global_bool32() { return global_bool32; }
 #endif
diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h
index d0dffa9de616a..fba6f45d37d1d 100644
--- a/llvm/include/llvm/IR/DerivedTypes.h
+++ b/llvm/include/llvm/IR/DerivedTypes.h
@@ -554,6 +554,23 @@ class VectorType : public Type {
     return VectorType::get(VTy->getElementType(), EltCnt * 2);
   }
 
+  /// This static method attempts to construct a VectorType with the same
+  /// size-in-bits as SizeTy but with an element type that matches the scalar
+  /// type of EltTy. The VectorType is returned on success, nullptr otherwise.
+  static VectorType *getWithSizeAndScalar(VectorType *SizeTy, Type *EltTy) {
+    if (SizeTy->getScalarType() == EltTy->getScalarType())
+      return SizeTy;
+
+    unsigned EltSize = EltTy->getScalarSizeInBits();
+    if (!SizeTy->getPrimitiveSizeInBits().isKnownMultipleOf(EltSize))
+      return nullptr;
+
+    ElementCount EC = SizeTy->getElementCount()
+                          .multiplyCoefficientBy(SizeTy->getScalarSizeInBits())
+                          .divideCoefficientBy(EltSize);
+    return VectorType::get(EltTy->getScalarType(), EC);
+  }
+
   /// Return true if the specified type is valid as a element type.
   static bool isValidElementType(Type *ElemTy);
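A hypothetical caller of the new helper, showing the size-preserving rewrap the codegen changes above rely on (the function name predicateAsBytes is illustrative, not from the patch):

```cpp
#include "llvm/IR/DerivedTypes.h"

using namespace llvm;

VectorType *predicateAsBytes(LLVMContext &Ctx) {
  // <vscale x 8 x i1>: one byte's worth of predicate bits per vscale.
  auto *Pred = ScalableVectorType::get(Type::getInt1Ty(Ctx), 8);
  auto *I8Vec = ScalableVectorType::get(Type::getInt8Ty(Ctx), 1);
  // Same size in bits, i8 scalar type -> returns <vscale x 1 x i8>;
  // would return nullptr if the bit size were not a multiple of 8.
  return VectorType::getWithSizeAndScalar(Pred, I8Vec);
}
```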