; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,NODOT
; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,NODOT
; RUN: llc -mtriple=riscv32 -mattr=+v,+experimental-zvqdotq -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,DOT,DOT32
; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-zvqdotq -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,DOT,DOT64

; Signed x signed i8 dot product: both inputs are sign-extended to i32,
; multiplied, and fully add-reduced to a scalar. With zvqdotq enabled the
; expected code is a zero-initialised vqdot.vv feeding vredsum.vs; with plain
; +v it is vsext.vf2 + vwmul.vv followed by vredsum.vs.
define i32 @vqdot_vv(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; NODOT-LABEL: vqdot_vv:
; NODOT:       # %bb.0: # %entry
; NODOT-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
; NODOT-NEXT:    vsext.vf2 v16, v8
; NODOT-NEXT:    vsext.vf2 v20, v10
; NODOT-NEXT:    vwmul.vv v8, v16, v20
; NODOT-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
; NODOT-NEXT:    vmv.s.x v16, zero
; NODOT-NEXT:    vredsum.vs v8, v8, v16
; NODOT-NEXT:    vmv.x.s a0, v8
; NODOT-NEXT:    ret
;
; DOT-LABEL: vqdot_vv:
; DOT:       # %bb.0: # %entry
; DOT-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
; DOT-NEXT:    vmv.v.i v12, 0
; DOT-NEXT:    vqdot.vv v12, v8, v10
; DOT-NEXT:    vmv.s.x v8, zero
; DOT-NEXT:    vredsum.vs v8, v12, v8
; DOT-NEXT:    vmv.x.s a0, v8
; DOT-NEXT:    ret
entry:
  %a.sext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.sext = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mul = mul <vscale x 16 x i32> %a.sext, %b.sext
  %res = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %mul)
  ret i32 %res
}

; Sign-extended i8 input multiplied by the splat constant 23, then fully
; add-reduced. All four run lines share one expectation (vsext.vf2 + vwmul.vx
; + vredsum.vs), i.e. no dot-product instruction is expected here even with
; zvqdotq enabled.
define i32 @vqdot_vx_constant(<vscale x 16 x i8> %a) {
; CHECK-LABEL: vqdot_vx_constant:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT:    vsext.vf2 v16, v8
; CHECK-NEXT:    li a0, 23
; CHECK-NEXT:    vwmul.vx v8, v16, a0
; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT:    vmv.s.x v16, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v16
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %a.sext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %mul = mul <vscale x 16 x i32> %a.sext, splat (i32 23)
  %res = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %mul)
  ret i32 %res
}

; Same computation as a sign-extended i8 input times splat 23, but with the
; constant written as the first operand of the mul. The expected assembly is
; the same vsext.vf2 + vwmul.vx + vredsum.vs sequence for all four run lines.
define i32 @vqdot_vx_constant_swapped(<vscale x 16 x i8> %a) {
; CHECK-LABEL: vqdot_vx_constant_swapped:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT:    vsext.vf2 v16, v8
; CHECK-NEXT:    li a0, 23
; CHECK-NEXT:    vwmul.vx v8, v16, a0
; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT:    vmv.s.x v16, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v16
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %a.sext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %mul = mul <vscale x 16 x i32> splat (i32 23), %a.sext
  %res = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %mul)
  ret i32 %res
}

; Unsigned x unsigned i8 dot product: both inputs are zero-extended to i32,
; multiplied, and fully add-reduced. With zvqdotq enabled the expected code is
; a zero-initialised vqdotu.vv feeding vredsum.vs; with plain +v it is an e8
; vwmulu.vv followed by a widening vwredsumu.vs.
define i32 @vqdotu_vv(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; NODOT-LABEL: vqdotu_vv:
; NODOT:       # %bb.0: # %entry
; NODOT-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
; NODOT-NEXT:    vwmulu.vv v12, v8, v10
; NODOT-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
; NODOT-NEXT:    vmv.s.x v8, zero
; NODOT-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
; NODOT-NEXT:    vwredsumu.vs v8, v12, v8
; NODOT-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
; NODOT-NEXT:    vmv.x.s a0, v8
; NODOT-NEXT:    ret
;
; DOT-LABEL: vqdotu_vv:
; DOT:       # %bb.0: # %entry
; DOT-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
; DOT-NEXT:    vmv.v.i v12, 0
; DOT-NEXT:    vqdotu.vv v12, v8, v10
; DOT-NEXT:    vmv.s.x v8, zero
; DOT-NEXT:    vredsum.vs v8, v12, v8
; DOT-NEXT:    vmv.x.s a0, v8
; DOT-NEXT:    ret
entry:
  %a.zext = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.zext = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mul = mul <vscale x 16 x i32> %a.zext, %b.zext
  %res = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %mul)
  ret i32 %res
}

; Zero-extended i8 input multiplied by the splat constant 123, then fully
; add-reduced. All four run lines share one expectation (vzext.vf2 + vwmulu.vx
; + vredsum.vs); no dot-product instruction is expected even with zvqdotq.
define i32 @vqdotu_vx_constant(<vscale x 16 x i8> %a) {
; CHECK-LABEL: vqdotu_vx_constant:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT:    vzext.vf2 v16, v8
; CHECK-NEXT:    li a0, 123
; CHECK-NEXT:    vwmulu.vx v8, v16, a0
; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT:    vmv.s.x v16, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v16
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %a.zext = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %mul = mul <vscale x 16 x i32> %a.zext, splat (i32 123)
  %res = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %mul)
  ret i32 %res
}

; Mixed-sign i8 dot product: %a is sign-extended, %b is zero-extended, and the
; i32 product is fully add-reduced. With zvqdotq enabled the expected code is
; a zero-initialised vqdotsu.vv feeding vredsum.vs; with plain +v it is
; vsext.vf2/vzext.vf2 + vwmulsu.vv followed by vredsum.vs.
define i32 @vqdotsu_vv(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; NODOT-LABEL: vqdotsu_vv:
; NODOT:       # %bb.0: # %entry
; NODOT-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
; NODOT-NEXT:    vsext.vf2 v16, v8
; NODOT-NEXT:    vzext.vf2 v20, v10
; NODOT-NEXT:    vwmulsu.vv v8, v16, v20
; NODOT-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
; NODOT-NEXT:    vmv.s.x v16, zero
; NODOT-NEXT:    vredsum.vs v8, v8, v16
; NODOT-NEXT:    vmv.x.s a0, v8
; NODOT-NEXT:    ret
;
; DOT-LABEL: vqdotsu_vv:
; DOT:       # %bb.0: # %entry
; DOT-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
; DOT-NEXT:    vmv.v.i v12, 0
; DOT-NEXT:    vqdotsu.vv v12, v8, v10
; DOT-NEXT:    vmv.s.x v8, zero
; DOT-NEXT:    vredsum.vs v8, v12, v8
; DOT-NEXT:    vmv.x.s a0, v8
; DOT-NEXT:    ret
entry:
  %a.sext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.zext = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mul = mul <vscale x 16 x i32> %a.sext, %b.zext
  %res = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %mul)
  ret i32 %res
}

; Mixed-sign i8 dot product with the mul operands written in the opposite
; order (zero-extended %b first, sign-extended %a second). The expected
; assembly still places the sign-extended input (v8) ahead of the
; zero-extended one (v10) in vqdotsu.vv / vwmulsu.vv.
define i32 @vqdotsu_vv_swapped(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; NODOT-LABEL: vqdotsu_vv_swapped:
; NODOT:       # %bb.0: # %entry
; NODOT-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
; NODOT-NEXT:    vsext.vf2 v16, v8
; NODOT-NEXT:    vzext.vf2 v20, v10
; NODOT-NEXT:    vwmulsu.vv v8, v16, v20
; NODOT-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
; NODOT-NEXT:    vmv.s.x v16, zero
; NODOT-NEXT:    vredsum.vs v8, v8, v16
; NODOT-NEXT:    vmv.x.s a0, v8
; NODOT-NEXT:    ret
;
; DOT-LABEL: vqdotsu_vv_swapped:
; DOT:       # %bb.0: # %entry
; DOT-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
; DOT-NEXT:    vmv.v.i v12, 0
; DOT-NEXT:    vqdotsu.vv v12, v8, v10
; DOT-NEXT:    vmv.s.x v8, zero
; DOT-NEXT:    vredsum.vs v8, v12, v8
; DOT-NEXT:    vmv.x.s a0, v8
; DOT-NEXT:    ret
entry:
  %a.sext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.zext = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mul = mul <vscale x 16 x i32> %b.zext, %a.sext
  %res = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %mul)
  ret i32 %res
}

; Sign-extended i8 input multiplied by the splat constant 123, then fully
; add-reduced. All four run lines share one expectation (vsext.vf2 + vwmul.vx
; + vredsum.vs); no dot-product instruction is expected even with zvqdotq.
; NOTE(review): the function name spells "vdotq" where the neighbouring tests
; use "vqdot" - probably a typo; renaming would also require regenerating the
; label assertions.
define i32 @vdotqsu_vx_constant(<vscale x 16 x i8> %a) {
; CHECK-LABEL: vdotqsu_vx_constant:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT:    vsext.vf2 v16, v8
; CHECK-NEXT:    li a0, 123
; CHECK-NEXT:    vwmul.vx v8, v16, a0
; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT:    vmv.s.x v16, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v16
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %a.sext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %mul = mul <vscale x 16 x i32> %a.sext, splat (i32 123)
  %res = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %mul)
  ret i32 %res
}

; Zero-extended i8 input multiplied by the negative splat constant -23, then
; fully add-reduced. All four run lines share one expectation: the constant is
; splatted with vmv.v.x and used as the first source of vwmulsu.vv, with the
; zero-extended input as the second source; no dot-product instruction is
; expected even with zvqdotq.
; NOTE(review): the function name spells "vdotq" where the neighbouring tests
; use "vqdot" - probably a typo; renaming would also require regenerating the
; label assertions.
define i32 @vdotqus_vx_constant(<vscale x 16 x i8> %a) {
; CHECK-LABEL: vdotqus_vx_constant:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT:    vzext.vf2 v16, v8
; CHECK-NEXT:    li a0, -23
; CHECK-NEXT:    vmv.v.x v20, a0
; CHECK-NEXT:    vwmulsu.vv v8, v20, v16
; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT:    vmv.s.x v16, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v16
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %a.zext = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %mul = mul <vscale x 16 x i32> %a.zext, splat (i32 -23)
  %res = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %mul)
  ret i32 %res
}

; Full add-reduction of a sign-extended i8 vector with no multiply in the IR.
; With zvqdotq enabled the expected code materialises an e8 splat of 1 and
; uses vqdot.vv against it before vredsum.vs; with plain +v it is vsext.vf4
; followed by vredsum.vs.
define i32 @reduce_of_sext(<vscale x 16 x i8> %a) {
; NODOT-LABEL: reduce_of_sext:
; NODOT:       # %bb.0: # %entry
; NODOT-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
; NODOT-NEXT:    vsext.vf4 v16, v8
; NODOT-NEXT:    vmv.s.x v8, zero
; NODOT-NEXT:    vredsum.vs v8, v16, v8
; NODOT-NEXT:    vmv.x.s a0, v8
; NODOT-NEXT:    ret
;
; DOT-LABEL: reduce_of_sext:
; DOT:       # %bb.0: # %entry
; DOT-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
; DOT-NEXT:    vmv.v.i v10, 1
; DOT-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
; DOT-NEXT:    vmv.v.i v12, 0
; DOT-NEXT:    vqdot.vv v12, v8, v10
; DOT-NEXT:    vmv.s.x v8, zero
; DOT-NEXT:    vredsum.vs v8, v12, v8
; DOT-NEXT:    vmv.x.s a0, v8
; DOT-NEXT:    ret
entry:
  %a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %res = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %a.ext)
  ret i32 %res
}

; Full add-reduction of a zero-extended i8 vector with no multiply in the IR.
; With zvqdotq enabled the expected code materialises an e8 splat of 1 and
; uses vqdotu.vv against it before vredsum.vs; with plain +v it is vzext.vf4
; followed by vredsum.vs.
define i32 @reduce_of_zext(<vscale x 16 x i8> %a) {
; NODOT-LABEL: reduce_of_zext:
; NODOT:       # %bb.0: # %entry
; NODOT-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
; NODOT-NEXT:    vzext.vf4 v16, v8
; NODOT-NEXT:    vmv.s.x v8, zero
; NODOT-NEXT:    vredsum.vs v8, v16, v8
; NODOT-NEXT:    vmv.x.s a0, v8
; NODOT-NEXT:    ret
;
; DOT-LABEL: reduce_of_zext:
; DOT:       # %bb.0: # %entry
; DOT-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
; DOT-NEXT:    vmv.v.i v10, 1
; DOT-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
; DOT-NEXT:    vmv.v.i v12, 0
; DOT-NEXT:    vqdotu.vv v12, v8, v10
; DOT-NEXT:    vmv.s.x v8, zero
; DOT-NEXT:    vredsum.vs v8, v12, v8
; DOT-NEXT:    vmv.x.s a0, v8
; DOT-NEXT:    ret
entry:
  %a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %res = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %a.ext)
  ret i32 %res
}

; Signed x signed i8 product added to an i32 vector %x before the full
; add-reduction. With zvqdotq enabled the expected code has vqdot.vv
; accumulate straight into v16 (no zero-initialising vmv.v.i) and then reduces
; at m8; with plain +v it is vwmacc.vv into v16 followed by vredsum.vs.
define i32 @vqdot_vv_accum(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i32> %x) {
; NODOT-LABEL: vqdot_vv_accum:
; NODOT:       # %bb.0: # %entry
; NODOT-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
; NODOT-NEXT:    vsext.vf2 v12, v8
; NODOT-NEXT:    vsext.vf2 v24, v10
; NODOT-NEXT:    vwmacc.vv v16, v12, v24
; NODOT-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
; NODOT-NEXT:    vmv.s.x v8, zero
; NODOT-NEXT:    vredsum.vs v8, v16, v8
; NODOT-NEXT:    vmv.x.s a0, v8
; NODOT-NEXT:    ret
;
; DOT-LABEL: vqdot_vv_accum:
; DOT:       # %bb.0: # %entry
; DOT-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
; DOT-NEXT:    vmv.s.x v12, zero
; DOT-NEXT:    vqdot.vv v16, v8, v10
; DOT-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
; DOT-NEXT:    vredsum.vs v8, v16, v12
; DOT-NEXT:    vmv.x.s a0, v8
; DOT-NEXT:    ret
entry:
  %a.sext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.sext = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mul = mul <vscale x 16 x i32> %a.sext, %b.sext
  %add = add <vscale x 16 x i32> %mul, %x
  %sum = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %add)
  ret i32 %sum
}

; Unsigned x unsigned i8 product added to an i32 vector %x before the full
; add-reduction. With zvqdotq enabled the expected code has vqdotu.vv
; accumulate straight into v16 and then reduces at m8; with plain +v it is an
; e8 vwmulu.vv, a vwaddu.wv into v16, then vredsum.vs.
define i32 @vqdotu_vv_accum(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i32> %x) {
; NODOT-LABEL: vqdotu_vv_accum:
; NODOT:       # %bb.0: # %entry
; NODOT-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
; NODOT-NEXT:    vwmulu.vv v12, v8, v10
; NODOT-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
; NODOT-NEXT:    vwaddu.wv v16, v16, v12
; NODOT-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
; NODOT-NEXT:    vmv.s.x v8, zero
; NODOT-NEXT:    vredsum.vs v8, v16, v8
; NODOT-NEXT:    vmv.x.s a0, v8
; NODOT-NEXT:    ret
;
; DOT-LABEL: vqdotu_vv_accum:
; DOT:       # %bb.0: # %entry
; DOT-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
; DOT-NEXT:    vmv.s.x v12, zero
; DOT-NEXT:    vqdotu.vv v16, v8, v10
; DOT-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
; DOT-NEXT:    vredsum.vs v8, v16, v12
; DOT-NEXT:    vmv.x.s a0, v8
; DOT-NEXT:    ret
entry:
  %a.zext = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.zext = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mul = mul <vscale x 16 x i32> %a.zext, %b.zext
  %add = add <vscale x 16 x i32> %mul, %x
  %sum = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %add)
  ret i32 %sum
}

; Mixed-sign (sext %a, zext %b) i8 product added to an i32 vector %x before
; the full add-reduction. With zvqdotq enabled the expected code has
; vqdotsu.vv accumulate straight into v16 and then reduces at m8; with plain
; +v it is vwmaccsu.vv into v16 followed by vredsum.vs.
define i32 @vqdotsu_vv_accum(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i32> %x) {
; NODOT-LABEL: vqdotsu_vv_accum:
; NODOT:       # %bb.0: # %entry
; NODOT-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
; NODOT-NEXT:    vsext.vf2 v12, v8
; NODOT-NEXT:    vzext.vf2 v24, v10
; NODOT-NEXT:    vwmaccsu.vv v16, v12, v24
; NODOT-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
; NODOT-NEXT:    vmv.s.x v8, zero
; NODOT-NEXT:    vredsum.vs v8, v16, v8
; NODOT-NEXT:    vmv.x.s a0, v8
; NODOT-NEXT:    ret
;
; DOT-LABEL: vqdotsu_vv_accum:
; DOT:       # %bb.0: # %entry
; DOT-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
; DOT-NEXT:    vmv.s.x v12, zero
; DOT-NEXT:    vqdotsu.vv v16, v8, v10
; DOT-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
; DOT-NEXT:    vredsum.vs v8, v16, v12
; DOT-NEXT:    vmv.x.s a0, v8
; DOT-NEXT:    ret
entry:
  %a.sext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.zext = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mul = mul <vscale x 16 x i32> %a.sext, %b.zext
  %add = add <vscale x 16 x i32> %mul, %x
  %sum = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %add)
  ret i32 %sum
}

; Signed x signed i8 dot product whose scalar reduction result is then added
; to the scalar %x. In both configurations the expected code has no separate
; scalar add: %x (a0) is moved into the reduction's start element with
; vmv.s.x. With zvqdotq the product comes from vqdot.vv; otherwise from
; vsext.vf2 + vwmul.vv.
define i32 @vqdot_vv_scalar_add(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 %x) {
; NODOT-LABEL: vqdot_vv_scalar_add:
; NODOT:       # %bb.0: # %entry
; NODOT-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
; NODOT-NEXT:    vsext.vf2 v16, v8
; NODOT-NEXT:    vsext.vf2 v20, v10
; NODOT-NEXT:    vwmul.vv v8, v16, v20
; NODOT-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
; NODOT-NEXT:    vmv.s.x v16, a0
; NODOT-NEXT:    vredsum.vs v8, v8, v16
; NODOT-NEXT:    vmv.x.s a0, v8
; NODOT-NEXT:    ret
;
; DOT-LABEL: vqdot_vv_scalar_add:
; DOT:       # %bb.0: # %entry
; DOT-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
; DOT-NEXT:    vmv.v.i v12, 0
; DOT-NEXT:    vqdot.vv v12, v8, v10
; DOT-NEXT:    vmv.s.x v8, a0
; DOT-NEXT:    vredsum.vs v8, v12, v8
; DOT-NEXT:    vmv.x.s a0, v8
; DOT-NEXT:    ret
entry:
  %a.sext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.sext = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mul = mul <vscale x 16 x i32> %a.sext, %b.sext
  %sum = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %mul)
  %add = add i32 %sum, %x
  ret i32 %add
}

; Unsigned x unsigned i8 dot product whose scalar reduction result is then
; added to the scalar %x. In both configurations the expected code has no
; separate scalar add: %x (a0) is moved into the reduction's start element
; with vmv.s.x. With zvqdotq the product comes from vqdotu.vv; otherwise from
; an e8 vwmulu.vv reduced with the widening vwredsumu.vs.
define i32 @vqdotu_vv_scalar_add(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 %x) {
; NODOT-LABEL: vqdotu_vv_scalar_add:
; NODOT:       # %bb.0: # %entry
; NODOT-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
; NODOT-NEXT:    vwmulu.vv v12, v8, v10
; NODOT-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
; NODOT-NEXT:    vmv.s.x v8, a0
; NODOT-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
; NODOT-NEXT:    vwredsumu.vs v8, v12, v8
; NODOT-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
; NODOT-NEXT:    vmv.x.s a0, v8
; NODOT-NEXT:    ret
;
; DOT-LABEL: vqdotu_vv_scalar_add:
; DOT:       # %bb.0: # %entry
; DOT-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
; DOT-NEXT:    vmv.v.i v12, 0
; DOT-NEXT:    vqdotu.vv v12, v8, v10
; DOT-NEXT:    vmv.s.x v8, a0
; DOT-NEXT:    vredsum.vs v8, v12, v8
; DOT-NEXT:    vmv.x.s a0, v8
; DOT-NEXT:    ret
entry:
  %a.zext = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.zext = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mul = mul <vscale x 16 x i32> %a.zext, %b.zext
  %sum = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %mul)
  %add = add i32 %sum, %x
  ret i32 %add
}

; Mixed-sign (sext %a, zext %b) i8 dot product whose scalar reduction result
; is then added to the scalar %x. In both configurations the expected code has
; no separate scalar add: %x (a0) is moved into the reduction's start element
; with vmv.s.x. With zvqdotq the product comes from vqdotsu.vv; otherwise from
; vsext.vf2/vzext.vf2 + vwmulsu.vv.
define i32 @vqdotsu_vv_scalar_add(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 %x) {
; NODOT-LABEL: vqdotsu_vv_scalar_add:
; NODOT:       # %bb.0: # %entry
; NODOT-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
; NODOT-NEXT:    vsext.vf2 v16, v8
; NODOT-NEXT:    vzext.vf2 v20, v10
; NODOT-NEXT:    vwmulsu.vv v8, v16, v20
; NODOT-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
; NODOT-NEXT:    vmv.s.x v16, a0
; NODOT-NEXT:    vredsum.vs v8, v8, v16
; NODOT-NEXT:    vmv.x.s a0, v8
; NODOT-NEXT:    ret
;
; DOT-LABEL: vqdotsu_vv_scalar_add:
; DOT:       # %bb.0: # %entry
; DOT-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
; DOT-NEXT:    vmv.v.i v12, 0
; DOT-NEXT:    vqdotsu.vv v12, v8, v10
; DOT-NEXT:    vmv.s.x v8, a0
; DOT-NEXT:    vredsum.vs v8, v12, v8
; DOT-NEXT:    vmv.x.s a0, v8
; DOT-NEXT:    ret
entry:
  %a.sext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.zext = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mul = mul <vscale x 16 x i32> %a.sext, %b.zext
  %sum = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %mul)
  %add = add i32 %sum, %x
  ret i32 %add
}

; Sum of two independent signed x signed i8 products (%a*%b and %c*%d), fully
; add-reduced. With zvqdotq enabled the expected code chains two vqdot.vv
; instructions into the same zero-initialised accumulator (v16) before a
; single vredsum.vs; with plain +v it is vwmul.vv followed by vwmacc.vv into
; the same destination.
define i32 @vqdot_vv_split(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d) {
; NODOT-LABEL: vqdot_vv_split:
; NODOT:       # %bb.0: # %entry
; NODOT-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
; NODOT-NEXT:    vsext.vf2 v16, v8
; NODOT-NEXT:    vsext.vf2 v20, v10
; NODOT-NEXT:    vsext.vf2 v24, v12
; NODOT-NEXT:    vsext.vf2 v28, v14
; NODOT-NEXT:    vwmul.vv v8, v16, v20
; NODOT-NEXT:    vwmacc.vv v8, v24, v28
; NODOT-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
; NODOT-NEXT:    vmv.s.x v16, zero
; NODOT-NEXT:    vredsum.vs v8, v8, v16
; NODOT-NEXT:    vmv.x.s a0, v8
; NODOT-NEXT:    ret
;
; DOT-LABEL: vqdot_vv_split:
; DOT:       # %bb.0: # %entry
; DOT-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
; DOT-NEXT:    vmv.v.i v16, 0
; DOT-NEXT:    vqdot.vv v16, v8, v10
; DOT-NEXT:    vqdot.vv v16, v12, v14
; DOT-NEXT:    vmv.s.x v8, zero
; DOT-NEXT:    vredsum.vs v8, v16, v8
; DOT-NEXT:    vmv.x.s a0, v8
; DOT-NEXT:    ret
entry:
  %a.sext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.sext = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mul = mul <vscale x 16 x i32> %a.sext, %b.sext
  %c.sext = sext <vscale x 16 x i8> %c to <vscale x 16 x i32>
  %d.sext = sext <vscale x 16 x i8> %d to <vscale x 16 x i32>
  %mul2 = mul <vscale x 16 x i32> %c.sext, %d.sext
  %add = add <vscale x 16 x i32> %mul, %mul2
  %sum = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %add)
  ret i32 %sum
}


; Partial add-reduction of a signed x signed i8 product from
; <vscale x 4 x i32> down to <vscale x 1 x i32>, starting from a zero
; accumulator. With zvqdotq enabled the expected code is a single
; zero-initialised vqdot.vv at e32/mf2; with plain +v it is a widening
; multiply followed by vslidedown.vx and a chain of vadd.vv.
define <vscale x 1 x i32> @partial_reduce_nf2(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b) {
; NODOT-LABEL: partial_reduce_nf2:
; NODOT:       # %bb.0: # %entry
; NODOT-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
; NODOT-NEXT:    vsext.vf2 v10, v8
; NODOT-NEXT:    vsext.vf2 v11, v9
; NODOT-NEXT:    csrr a0, vlenb
; NODOT-NEXT:    vwmul.vv v8, v10, v11
; NODOT-NEXT:    srli a0, a0, 3
; NODOT-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
; NODOT-NEXT:    vslidedown.vx v10, v9, a0
; NODOT-NEXT:    vslidedown.vx v11, v8, a0
; NODOT-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
; NODOT-NEXT:    vadd.vv v8, v10, v8
; NODOT-NEXT:    vadd.vv v9, v11, v9
; NODOT-NEXT:    vadd.vv v8, v9, v8
; NODOT-NEXT:    ret
;
; DOT-LABEL: partial_reduce_nf2:
; DOT:       # %bb.0: # %entry
; DOT-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
; DOT-NEXT:    vmv.v.i v10, 0
; DOT-NEXT:    vqdot.vv v10, v8, v9
; DOT-NEXT:    vmv1r.v v8, v10
; DOT-NEXT:    ret
entry:
  %a.sext = sext <vscale x 4 x i8> %a to <vscale x 4 x i32>
  %b.sext = sext <vscale x 4 x i8> %b to <vscale x 4 x i32>
  %mul = mul <vscale x 4 x i32> %a.sext, %b.sext
  %res = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 1 x i32> zeroinitializer, <vscale x 4 x i32> %mul)
  ret <vscale x 1 x i32> %res
}

; Partial add-reduction of a signed x signed i8 product from
; <vscale x 8 x i32> down to <vscale x 2 x i32>, starting from a zero
; accumulator. With zvqdotq enabled the expected code is a single
; zero-initialised vqdot.vv at e32/m1; with plain +v it is a widening multiply
; followed by three vadd.vv over the m1 pieces of the product.
define <vscale x 2 x i32> @partial_reduce_m1(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
; NODOT-LABEL: partial_reduce_m1:
; NODOT:       # %bb.0: # %entry
; NODOT-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
; NODOT-NEXT:    vsext.vf2 v12, v8
; NODOT-NEXT:    vsext.vf2 v14, v9
; NODOT-NEXT:    vwmul.vv v8, v12, v14
; NODOT-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
; NODOT-NEXT:    vadd.vv v8, v11, v8
; NODOT-NEXT:    vadd.vv v9, v9, v10
; NODOT-NEXT:    vadd.vv v8, v9, v8
; NODOT-NEXT:    ret
;
; DOT-LABEL: partial_reduce_m1:
; DOT:       # %bb.0: # %entry
; DOT-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
; DOT-NEXT:    vmv.v.i v10, 0
; DOT-NEXT:    vqdot.vv v10, v8, v9
; DOT-NEXT:    vmv.v.v v8, v10
; DOT-NEXT:    ret
entry:
  %a.sext = sext <vscale x 8 x i8> %a to <vscale x 8 x i32>
  %b.sext = sext <vscale x 8 x i8> %b to <vscale x 8 x i32>
  %mul = mul <vscale x 8 x i32> %a.sext, %b.sext
  %res = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 2 x i32> zeroinitializer, <vscale x 8 x i32> %mul)
  ret <vscale x 2 x i32> %res
}

; Partial add-reduction of a signed x signed i8 product from
; <vscale x 16 x i32> down to <vscale x 4 x i32>, starting from a zero
; accumulator. With zvqdotq enabled the expected code is a single
; zero-initialised vqdot.vv at e32/m2; with plain +v it is a widening multiply
; followed by three vadd.vv over the m2 pieces of the product.
define <vscale x 4 x i32> @partial_reduce_m2(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; NODOT-LABEL: partial_reduce_m2:
; NODOT:       # %bb.0: # %entry
; NODOT-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
; NODOT-NEXT:    vsext.vf2 v16, v8
; NODOT-NEXT:    vsext.vf2 v20, v10
; NODOT-NEXT:    vwmul.vv v8, v16, v20
; NODOT-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
; NODOT-NEXT:    vadd.vv v8, v14, v8
; NODOT-NEXT:    vadd.vv v10, v10, v12
; NODOT-NEXT:    vadd.vv v8, v10, v8
; NODOT-NEXT:    ret
;
; DOT-LABEL: partial_reduce_m2:
; DOT:       # %bb.0: # %entry
; DOT-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
; DOT-NEXT:    vmv.v.i v12, 0
; DOT-NEXT:    vqdot.vv v12, v8, v10
; DOT-NEXT:    vmv.v.v v8, v12
; DOT-NEXT:    ret
entry:
  %a.sext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.sext = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mul = mul <vscale x 16 x i32> %a.sext, %b.sext
  %res = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 4 x i32> zeroinitializer, <vscale x 16 x i32> %mul)
  ret <vscale x 4 x i32> %res
}

; Partial add-reduction of a signed x signed i8 product from
; <vscale x 32 x i32> down to <vscale x 8 x i32>, starting from a zero
; accumulator. With zvqdotq enabled the expected code is a single
; zero-initialised vqdot.vv at e32/m4; with plain +v it is two widening
; multiplies followed by three vadd.vv at m4.
define <vscale x 8 x i32> @partial_reduce_m4(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
; NODOT-LABEL: partial_reduce_m4:
; NODOT:       # %bb.0: # %entry
; NODOT-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
; NODOT-NEXT:    vsext.vf2 v24, v8
; NODOT-NEXT:    vsext.vf2 v16, v10
; NODOT-NEXT:    vsext.vf2 v28, v12
; NODOT-NEXT:    vsext.vf2 v20, v14
; NODOT-NEXT:    vwmul.vv v8, v16, v20
; NODOT-NEXT:    vwmul.vv v16, v24, v28
; NODOT-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
; NODOT-NEXT:    vadd.vv v16, v20, v16
; NODOT-NEXT:    vadd.vv v8, v12, v8
; NODOT-NEXT:    vadd.vv v8, v8, v16
; NODOT-NEXT:    ret
;
; DOT-LABEL: partial_reduce_m4:
; DOT:       # %bb.0: # %entry
; DOT-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
; DOT-NEXT:    vmv.v.i v16, 0
; DOT-NEXT:    vqdot.vv v16, v8, v12
; DOT-NEXT:    vmv.v.v v8, v16
; DOT-NEXT:    ret
entry:
  %a.sext = sext <vscale x 32 x i8> %a to <vscale x 32 x i32>
  %b.sext = sext <vscale x 32 x i8> %b to <vscale x 32 x i32>
  %mul = mul <vscale x 32 x i32> %a.sext, %b.sext
  %res = call <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 8 x i32> zeroinitializer, <vscale x 32 x i32> %mul)
  ret <vscale x 8 x i32> %res
}

; Partial add-reduction of a signed x signed i8 product from
; <vscale x 64 x i32> down to <vscale x 16 x i32>, starting from a zero
; accumulator. With zvqdotq enabled the expected code is a single
; zero-initialised vqdot.vv at e32/m8 with no stack frame; with plain +v the
; expected code sets up a stack frame, spills and reloads one m4 group, and
; uses vwmul.vv plus three vwmacc.vv.
define <vscale x 16 x i32> @partial_reduce_m8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b) {
; NODOT-LABEL: partial_reduce_m8:
; NODOT:       # %bb.0: # %entry
; NODOT-NEXT:    addi sp, sp, -16
; NODOT-NEXT:    .cfi_def_cfa_offset 16
; NODOT-NEXT:    csrr a0, vlenb
; NODOT-NEXT:    slli a0, a0, 2
; NODOT-NEXT:    sub sp, sp, a0
; NODOT-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
; NODOT-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
; NODOT-NEXT:    vsext.vf2 v24, v10
; NODOT-NEXT:    addi a0, sp, 16
; NODOT-NEXT:    vs4r.v v24, (a0) # vscale x 32-byte Folded Spill
; NODOT-NEXT:    vsext.vf2 v0, v8
; NODOT-NEXT:    vsext.vf2 v8, v18
; NODOT-NEXT:    vsext.vf2 v4, v16
; NODOT-NEXT:    vwmul.vv v24, v0, v4
; NODOT-NEXT:    vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
; NODOT-NEXT:    vwmacc.vv v24, v16, v8
; NODOT-NEXT:    vsext.vf2 v8, v12
; NODOT-NEXT:    vsext.vf2 v16, v20
; NODOT-NEXT:    vwmacc.vv v24, v8, v16
; NODOT-NEXT:    vsext.vf2 v8, v14
; NODOT-NEXT:    vsext.vf2 v12, v22
; NODOT-NEXT:    vwmacc.vv v24, v8, v12
; NODOT-NEXT:    vmv8r.v v8, v24
; NODOT-NEXT:    csrr a0, vlenb
; NODOT-NEXT:    slli a0, a0, 2
; NODOT-NEXT:    add sp, sp, a0
; NODOT-NEXT:    .cfi_def_cfa sp, 16
; NODOT-NEXT:    addi sp, sp, 16
; NODOT-NEXT:    .cfi_def_cfa_offset 0
; NODOT-NEXT:    ret
;
; DOT-LABEL: partial_reduce_m8:
; DOT:       # %bb.0: # %entry
; DOT-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
; DOT-NEXT:    vmv.v.i v24, 0
; DOT-NEXT:    vqdot.vv v24, v8, v16
; DOT-NEXT:    vmv.v.v v8, v24
; DOT-NEXT:    ret
entry:
  %a.sext = sext <vscale x 64 x i8> %a to <vscale x 64 x i32>
  %b.sext = sext <vscale x 64 x i8> %b to <vscale x 64 x i32>
  %mul = mul <vscale x 64 x i32> %a.sext, %b.sext
  %res = call <vscale x 16 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 16 x i32> zeroinitializer, <vscale x 64 x i32> %mul)
  ret <vscale x 16 x i32> %res
}

; Partial add-reduction of a signed x signed i8 product from
; <vscale x 128 x i32> down to <vscale x 32 x i32>, starting from a zero
; accumulator. The result needs two m8 register groups (returned in v8 and
; v16), and part of the input is loaded from memory through a0. Both
; configurations are expected to spill and reload m8 groups on a
; 24 * vlenb stack area; with zvqdotq enabled the arithmetic is two
; zero-initialised vqdot.vv at e32/m8, while with plain +v it is two
; vwmul.vv each followed by three vwmacc.vv.
define <vscale x 32 x i32> @partial_reduce_m16(<vscale x 128 x i8> %a, <vscale x 128 x i8> %b) {
; NODOT-LABEL: partial_reduce_m16:
; NODOT:       # %bb.0: # %entry
; NODOT-NEXT:    addi sp, sp, -16
; NODOT-NEXT:    .cfi_def_cfa_offset 16
; NODOT-NEXT:    csrr a1, vlenb
; NODOT-NEXT:    slli a1, a1, 3
; NODOT-NEXT:    mv a2, a1
; NODOT-NEXT:    slli a1, a1, 1
; NODOT-NEXT:    add a1, a1, a2
; NODOT-NEXT:    sub sp, sp, a1
; NODOT-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; NODOT-NEXT:    csrr a1, vlenb
; NODOT-NEXT:    slli a1, a1, 4
; NODOT-NEXT:    add a1, sp, a1
; NODOT-NEXT:    addi a1, a1, 16
; NODOT-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
; NODOT-NEXT:    addi a1, sp, 16
; NODOT-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
; NODOT-NEXT:    vl8r.v v16, (a0)
; NODOT-NEXT:    csrr a1, vlenb
; NODOT-NEXT:    slli a1, a1, 3
; NODOT-NEXT:    add a1, sp, a1
; NODOT-NEXT:    addi a1, a1, 16
; NODOT-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
; NODOT-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
; NODOT-NEXT:    vsext.vf2 v4, v8
; NODOT-NEXT:    vsext.vf2 v0, v16
; NODOT-NEXT:    vwmul.vv v24, v4, v0
; NODOT-NEXT:    vsext.vf2 v4, v10
; NODOT-NEXT:    vsext.vf2 v8, v18
; NODOT-NEXT:    vwmacc.vv v24, v4, v8
; NODOT-NEXT:    csrr a1, vlenb
; NODOT-NEXT:    slli a1, a1, 3
; NODOT-NEXT:    add a0, a0, a1
; NODOT-NEXT:    vsext.vf2 v0, v12
; NODOT-NEXT:    vl8r.v v8, (a0)
; NODOT-NEXT:    csrr a0, vlenb
; NODOT-NEXT:    slli a0, a0, 3
; NODOT-NEXT:    add a0, sp, a0
; NODOT-NEXT:    addi a0, a0, 16
; NODOT-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; NODOT-NEXT:    vsext.vf2 v4, v20
; NODOT-NEXT:    vwmacc.vv v24, v0, v4
; NODOT-NEXT:    csrr a0, vlenb
; NODOT-NEXT:    slli a0, a0, 4
; NODOT-NEXT:    add a0, sp, a0
; NODOT-NEXT:    addi a0, a0, 16
; NODOT-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
; NODOT-NEXT:    vsext.vf2 v20, v0
; NODOT-NEXT:    vsext.vf2 v16, v8
; NODOT-NEXT:    vwmul.vv v0, v20, v16
; NODOT-NEXT:    csrr a0, vlenb
; NODOT-NEXT:    slli a0, a0, 4
; NODOT-NEXT:    add a0, sp, a0
; NODOT-NEXT:    addi a0, a0, 16
; NODOT-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; NODOT-NEXT:    vsext.vf2 v20, v18
; NODOT-NEXT:    vsext.vf2 v16, v10
; NODOT-NEXT:    vwmacc.vv v0, v20, v16
; NODOT-NEXT:    csrr a0, vlenb
; NODOT-NEXT:    slli a0, a0, 4
; NODOT-NEXT:    add a0, sp, a0
; NODOT-NEXT:    addi a0, a0, 16
; NODOT-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; NODOT-NEXT:    vsext.vf2 v8, v20
; NODOT-NEXT:    vsext.vf2 v16, v12
; NODOT-NEXT:    vwmacc.vv v0, v8, v16
; NODOT-NEXT:    csrr a0, vlenb
; NODOT-NEXT:    slli a0, a0, 4
; NODOT-NEXT:    add a0, sp, a0
; NODOT-NEXT:    addi a0, a0, 16
; NODOT-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; NODOT-NEXT:    vsext.vf2 v8, v22
; NODOT-NEXT:    vsext.vf2 v16, v14
; NODOT-NEXT:    vwmacc.vv v0, v8, v16
; NODOT-NEXT:    addi a0, sp, 16
; NODOT-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; NODOT-NEXT:    vsext.vf2 v8, v14
; NODOT-NEXT:    csrr a0, vlenb
; NODOT-NEXT:    slli a0, a0, 3
; NODOT-NEXT:    add a0, sp, a0
; NODOT-NEXT:    addi a0, a0, 16
; NODOT-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; NODOT-NEXT:    vsext.vf2 v12, v22
; NODOT-NEXT:    vwmacc.vv v24, v8, v12
; NODOT-NEXT:    vmv8r.v v8, v24
; NODOT-NEXT:    vmv8r.v v16, v0
; NODOT-NEXT:    csrr a0, vlenb
; NODOT-NEXT:    slli a0, a0, 3
; NODOT-NEXT:    mv a1, a0
; NODOT-NEXT:    slli a0, a0, 1
; NODOT-NEXT:    add a0, a0, a1
; NODOT-NEXT:    add sp, sp, a0
; NODOT-NEXT:    .cfi_def_cfa sp, 16
; NODOT-NEXT:    addi sp, sp, 16
; NODOT-NEXT:    .cfi_def_cfa_offset 0
; NODOT-NEXT:    ret
;
; DOT-LABEL: partial_reduce_m16:
; DOT:       # %bb.0: # %entry
; DOT-NEXT:    addi sp, sp, -16
; DOT-NEXT:    .cfi_def_cfa_offset 16
; DOT-NEXT:    csrr a1, vlenb
; DOT-NEXT:    slli a1, a1, 3
; DOT-NEXT:    mv a2, a1
; DOT-NEXT:    slli a1, a1, 1
; DOT-NEXT:    add a1, a1, a2
; DOT-NEXT:    sub sp, sp, a1
; DOT-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; DOT-NEXT:    csrr a1, vlenb
; DOT-NEXT:    slli a1, a1, 4
; DOT-NEXT:    add a1, sp, a1
; DOT-NEXT:    addi a1, a1, 16
; DOT-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
; DOT-NEXT:    csrr a1, vlenb
; DOT-NEXT:    slli a1, a1, 3
; DOT-NEXT:    add a1, sp, a1
; DOT-NEXT:    addi a1, a1, 16
; DOT-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
; DOT-NEXT:    csrr a1, vlenb
; DOT-NEXT:    slli a1, a1, 3
; DOT-NEXT:    add a1, a0, a1
; DOT-NEXT:    vl8r.v v8, (a0)
; DOT-NEXT:    vl8r.v v16, (a1)
; DOT-NEXT:    addi a0, sp, 16
; DOT-NEXT:    vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; DOT-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
; DOT-NEXT:    vmv.v.i v24, 0
; DOT-NEXT:    vmv.v.i v0, 0
; DOT-NEXT:    csrr a0, vlenb
; DOT-NEXT:    slli a0, a0, 3
; DOT-NEXT:    add a0, sp, a0
; DOT-NEXT:    addi a0, a0, 16
; DOT-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; DOT-NEXT:    vqdot.vv v0, v16, v8
; DOT-NEXT:    csrr a0, vlenb
; DOT-NEXT:    slli a0, a0, 4
; DOT-NEXT:    add a0, sp, a0
; DOT-NEXT:    addi a0, a0, 16
; DOT-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; DOT-NEXT:    addi a0, sp, 16
; DOT-NEXT:    vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; DOT-NEXT:    vqdot.vv v24, v8, v16
; DOT-NEXT:    vmv.v.v v8, v0
; DOT-NEXT:    vmv.v.v v16, v24
; DOT-NEXT:    csrr a0, vlenb
; DOT-NEXT:    slli a0, a0, 3
; DOT-NEXT:    mv a1, a0
; DOT-NEXT:    slli a0, a0, 1
; DOT-NEXT:    add a0, a0, a1
; DOT-NEXT:    add sp, sp, a0
; DOT-NEXT:    .cfi_def_cfa sp, 16
; DOT-NEXT:    addi sp, sp, 16
; DOT-NEXT:    .cfi_def_cfa_offset 0
; DOT-NEXT:    ret
entry:
  %a.sext = sext <vscale x 128 x i8> %a to <vscale x 128 x i32>
  %b.sext = sext <vscale x 128 x i8> %b to <vscale x 128 x i32>
  %mul = mul <vscale x 128 x i32> %a.sext, %b.sext
  %res = call <vscale x 32 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 32 x i32> zeroinitializer, <vscale x 128 x i32> %mul)
  ret <vscale x 32 x i32> %res
}

; Partial add-reduction of a signed x signed i8 product from
; <vscale x 16 x i32> down to <vscale x 4 x i32> with a non-zero accumulator
; operand %accum. With zvqdotq enabled the expected code has vqdot.vv
; accumulate straight into v12 with no zero-initialising vmv.v.i; with plain
; +v it is a widening multiply followed by four vadd.vv at m2.
define <vscale x 4 x i32> @partial_reduce_accum(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 4 x i32> %accum) {
; NODOT-LABEL: partial_reduce_accum:
; NODOT:       # %bb.0: # %entry
; NODOT-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
; NODOT-NEXT:    vsext.vf2 v24, v8
; NODOT-NEXT:    vsext.vf2 v28, v10
; NODOT-NEXT:    vwmul.vv v16, v24, v28
; NODOT-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
; NODOT-NEXT:    vadd.vv v8, v18, v20
; NODOT-NEXT:    vadd.vv v10, v12, v16
; NODOT-NEXT:    vadd.vv v10, v22, v10
; NODOT-NEXT:    vadd.vv v8, v8, v10
; NODOT-NEXT:    ret
;
; DOT-LABEL: partial_reduce_accum:
; DOT:       # %bb.0: # %entry
; DOT-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
; DOT-NEXT:    vqdot.vv v12, v8, v10
; DOT-NEXT:    vmv.v.v v8, v12
; DOT-NEXT:    ret
entry:
  %a.sext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.sext = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mul = mul <vscale x 16 x i32> %a.sext, %b.sext
  %res = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 4 x i32> %accum, <vscale x 16 x i32> %mul)
  ret <vscale x 4 x i32> %res
}

; Here the multiply result is passed as the first (accumulator) operand of
; the partial reduction and the value being reduced is zeroinitializer; the
; result and both operands all have type <vscale x 16 x i32>.
; The expected output is shared by every RUN configuration: only the two
; vsext.vf2 and the vwmul.vv remain, with no vqdot.vv and no vadd.vv.
define <vscale x 16 x i32> @partial_reduce_via_accum(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: partial_reduce_via_accum:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT:    vsext.vf2 v16, v8
; CHECK-NEXT:    vsext.vf2 v20, v10
; CHECK-NEXT:    vwmul.vv v8, v16, v20
; CHECK-NEXT:    ret
entry:
  %a.sext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.sext = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mul = mul <vscale x 16 x i32> %a.sext, %b.sext
  %res = call <vscale x 16 x i32> @llvm.experimental.vector.partial.reduce.add.nxv16i32.nxv16i32(<vscale x 16 x i32> %mul, <vscale x 16 x i32> zeroinitializer)
  ret <vscale x 16 x i32> %res
}

; Unsigned variant: both i8 inputs are zero-extended before the multiply, and
; the <vscale x 4 x i32> product is partially reduced into a
; <vscale x 1 x i32> result with a zero accumulator.
; With zvqdotq the expected code is a zero splat followed by vqdotu.vv.
; Without zvqdotq the product is built with vwmulu.vv + vzext.vf2 and the
; pieces are combined with vslidedown.vx / vadd.vv.
define <vscale x 1 x i32> @partial_reduce_vqdotu(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b) {
; NODOT-LABEL: partial_reduce_vqdotu:
; NODOT:       # %bb.0: # %entry
; NODOT-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
; NODOT-NEXT:    vwmulu.vv v10, v8, v9
; NODOT-NEXT:    csrr a0, vlenb
; NODOT-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
; NODOT-NEXT:    vzext.vf2 v8, v10
; NODOT-NEXT:    srli a0, a0, 3
; NODOT-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
; NODOT-NEXT:    vslidedown.vx v10, v9, a0
; NODOT-NEXT:    vslidedown.vx v11, v8, a0
; NODOT-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
; NODOT-NEXT:    vadd.vv v8, v10, v8
; NODOT-NEXT:    vadd.vv v9, v11, v9
; NODOT-NEXT:    vadd.vv v8, v9, v8
; NODOT-NEXT:    ret
;
; DOT-LABEL: partial_reduce_vqdotu:
; DOT:       # %bb.0: # %entry
; DOT-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
; DOT-NEXT:    vmv.v.i v10, 0
; DOT-NEXT:    vqdotu.vv v10, v8, v9
; DOT-NEXT:    vmv1r.v v8, v10
; DOT-NEXT:    ret
entry:
  %a.zext = zext <vscale x 4 x i8> %a to <vscale x 4 x i32>
  %b.zext = zext <vscale x 4 x i8> %b to <vscale x 4 x i32>
  %mul = mul <vscale x 4 x i32> %a.zext, %b.zext
  %res = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 1 x i32> zeroinitializer, <vscale x 4 x i32> %mul)
  ret <vscale x 1 x i32> %res
}

; Mixed-sign variant: %a is sign-extended and %b is zero-extended before the
; multiply, and the <vscale x 4 x i32> product is partially reduced into a
; <vscale x 1 x i32> result with a zero accumulator.
; With zvqdotq the expected code is a zero splat followed by vqdotsu.vv with
; the sign-extended input as the first source operand.
; Without zvqdotq the inputs are extended with vsext.vf2 / vzext.vf2,
; multiplied with vwmulsu.vv, and combined with vslidedown.vx / vadd.vv.
define <vscale x 1 x i32> @partial_reduce_vqdotsu(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b) {
; NODOT-LABEL: partial_reduce_vqdotsu:
; NODOT:       # %bb.0: # %entry
; NODOT-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
; NODOT-NEXT:    vsext.vf2 v10, v8
; NODOT-NEXT:    vzext.vf2 v11, v9
; NODOT-NEXT:    csrr a0, vlenb
; NODOT-NEXT:    vwmulsu.vv v8, v10, v11
; NODOT-NEXT:    srli a0, a0, 3
; NODOT-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
; NODOT-NEXT:    vslidedown.vx v10, v9, a0
; NODOT-NEXT:    vslidedown.vx v11, v8, a0
; NODOT-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
; NODOT-NEXT:    vadd.vv v8, v10, v8
; NODOT-NEXT:    vadd.vv v9, v11, v9
; NODOT-NEXT:    vadd.vv v8, v9, v8
; NODOT-NEXT:    ret
;
; DOT-LABEL: partial_reduce_vqdotsu:
; DOT:       # %bb.0: # %entry
; DOT-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
; DOT-NEXT:    vmv.v.i v10, 0
; DOT-NEXT:    vqdotsu.vv v10, v8, v9
; DOT-NEXT:    vmv1r.v v8, v10
; DOT-NEXT:    ret
entry:
  %a.sext = sext <vscale x 4 x i8> %a to <vscale x 4 x i32>
  %b.zext = zext <vscale x 4 x i8> %b to <vscale x 4 x i32>
  %mul = mul <vscale x 4 x i32> %a.sext, %b.zext
  %res = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 1 x i32> zeroinitializer, <vscale x 4 x i32> %mul)
  ret <vscale x 1 x i32> %res
}


; Partial reduction applied directly to a sign extension, with no multiply
; in the IR and a zero accumulator.
; With zvqdotq the expected code splats the constant 1 at e8, zeroes the
; destination, and issues vqdot.vv of the input against that splat.
; Without zvqdotq the input is widened with vsext.vf4 and the m2-sized
; pieces are summed with vadd.vv.
define <vscale x 4 x i32> @partial_of_sext(<vscale x 16 x i8> %a) {
; NODOT-LABEL: partial_of_sext:
; NODOT:       # %bb.0: # %entry
; NODOT-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
; NODOT-NEXT:    vsext.vf4 v16, v8
; NODOT-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
; NODOT-NEXT:    vadd.vv v8, v22, v16
; NODOT-NEXT:    vadd.vv v10, v18, v20
; NODOT-NEXT:    vadd.vv v8, v10, v8
; NODOT-NEXT:    ret
;
; DOT-LABEL: partial_of_sext:
; DOT:       # %bb.0: # %entry
; DOT-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
; DOT-NEXT:    vmv.v.i v12, 1
; DOT-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
; DOT-NEXT:    vmv.v.i v10, 0
; DOT-NEXT:    vqdot.vv v10, v8, v12
; DOT-NEXT:    vmv.v.v v8, v10
; DOT-NEXT:    ret
entry:
  %a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %res = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 4 x i32> zeroinitializer, <vscale x 16 x i32> %a.ext)
  ret <vscale x 4 x i32> %res
}

; Partial reduction applied directly to a zero extension, with no multiply
; in the IR and a zero accumulator.
; With zvqdotq the expected code splats the constant 1 at e8, zeroes the
; destination, and issues the unsigned vqdotu.vv of the input against that
; splat.
; Without zvqdotq the input is widened with vzext.vf4 and the m2-sized
; pieces are summed with vadd.vv.
define <vscale x 4 x i32> @partial_of_zext(<vscale x 16 x i8> %a) {
; NODOT-LABEL: partial_of_zext:
; NODOT:       # %bb.0: # %entry
; NODOT-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
; NODOT-NEXT:    vzext.vf4 v16, v8
; NODOT-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
; NODOT-NEXT:    vadd.vv v8, v22, v16
; NODOT-NEXT:    vadd.vv v10, v18, v20
; NODOT-NEXT:    vadd.vv v8, v10, v8
; NODOT-NEXT:    ret
;
; DOT-LABEL: partial_of_zext:
; DOT:       # %bb.0: # %entry
; DOT-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
; DOT-NEXT:    vmv.v.i v12, 1
; DOT-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
; DOT-NEXT:    vmv.v.i v10, 0
; DOT-NEXT:    vqdotu.vv v10, v8, v12
; DOT-NEXT:    vmv.v.v v8, v10
; DOT-NEXT:    ret
entry:
  %a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %res = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add(<vscale x 4 x i32> zeroinitializer, <vscale x 16 x i32> %a.ext)
  ret <vscale x 4 x i32> %res
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; DOT32: {{.*}}
; DOT64: {{.*}}
