; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s -check-prefixes=PTX
; RUN: %if ptxas-12.9 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %}

; In this test, we check that all the addressing modes are lowered correctly
; for 256-bit invariant loads, which get lowered to ld.global.nc
; addr can be any of the following:
; - avar : direct address
; - asi: direct address + offset
; - areg_64: 64-bit register
; - ari_64: 64-bit register + offset
; Since this is a Blackwell+ feature, and support for 32-bit addressing does
; not exist after sm_90, the 32-bit "areg" and "ari" addressing modes are
; neither tested nor supported.

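; For invariant loads, asi was historically not supported: it was selected as a
; move into a register, an add of the offset, and a load through areg_64.
; The asi_* checks below now expect the offset to be folded directly into the
; address operand ([globalin+32]).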

; Checks 8 types: i8, i16, bfloat, half, i32, i64, float, double

; Global is the only address space that currently supports 256-bit loads/stores

; External symbols in addrspace(1) (global memory). The avar_* and asi_* tests
; load from @globalin and store to @globalout, directly or at a +32 byte offset.
@globalin = external addrspace(1) global ptr
@globalout = external addrspace(1) global ptr

; avar (direct address), <32 x i8>: invariant load straight from @globalin,
; stored to @globalout. Expected: one ld.global.nc.v8.b32 addressed [globalin].
define void @avar_i8() {
; PTX-LABEL: avar_i8(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
; PTX-NEXT:    st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT:    ret;
  %load = load <32 x i8>, ptr addrspace(1) @globalin, !invariant.load !0
  store <32 x i8> %load, ptr addrspace(1) @globalout
  ret void
}

; avar (direct address), <16 x i16>: invariant load straight from @globalin,
; stored to @globalout. Expected: one ld.global.nc.v8.b32 addressed [globalin].
define void @avar_i16() {
; PTX-LABEL: avar_i16(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
; PTX-NEXT:    st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT:    ret;
  %load = load <16 x i16>, ptr addrspace(1) @globalin, !invariant.load !0
  store <16 x i16> %load, ptr addrspace(1) @globalout
  ret void
}

; avar (direct address), <16 x half>: invariant load straight from @globalin,
; stored to @globalout. Expected: one ld.global.nc.v8.b32 addressed [globalin].
define void @avar_half() {
; PTX-LABEL: avar_half(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
; PTX-NEXT:    st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT:    ret;
  %load = load <16 x half>, ptr addrspace(1) @globalin, !invariant.load !0
  store <16 x half> %load, ptr addrspace(1) @globalout
  ret void
}

; avar (direct address), <16 x bfloat>: invariant load straight from @globalin,
; stored to @globalout. Expected: one ld.global.nc.v8.b32 addressed [globalin].
define void @avar_bfloat() {
; PTX-LABEL: avar_bfloat(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
; PTX-NEXT:    st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT:    ret;
  %load = load <16 x bfloat>, ptr addrspace(1) @globalin, !invariant.load !0
  store <16 x bfloat> %load, ptr addrspace(1) @globalout
  ret void
}

; avar (direct address), <8 x i32>: invariant load straight from @globalin,
; stored to @globalout. Expected: one ld.global.nc.v8.b32 addressed [globalin].
define void @avar_i32() {
; PTX-LABEL: avar_i32(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
; PTX-NEXT:    st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT:    ret;
  %load = load <8 x i32>, ptr addrspace(1) @globalin, !invariant.load !0
  store <8 x i32> %load, ptr addrspace(1) @globalout
  ret void
}

; avar (direct address), <4 x i64>: invariant load straight from @globalin,
; stored to @globalout. Expected: one ld.global.nc.v4.b64 addressed [globalin].
define void @avar_i64() {
; PTX-LABEL: avar_i64(
; PTX:       {
; PTX-NEXT:    .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin];
; PTX-NEXT:    st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4};
; PTX-NEXT:    ret;
  %load = load <4 x i64>, ptr addrspace(1) @globalin, !invariant.load !0
  store <4 x i64> %load, ptr addrspace(1) @globalout
  ret void
}

; avar (direct address), <8 x float>: invariant load straight from @globalin,
; stored to @globalout. Note the checks expect the eight floats to travel as
; four 64-bit registers (ld.global.nc.v4.b64), not eight 32-bit ones.
define void @avar_float() {
; PTX-LABEL: avar_float(
; PTX:       {
; PTX-NEXT:    .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin];
; PTX-NEXT:    st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4};
; PTX-NEXT:    ret;
  %load = load <8 x float>, ptr addrspace(1) @globalin, !invariant.load !0
  store <8 x float> %load, ptr addrspace(1) @globalout
  ret void
}

; avar (direct address), <4 x double>: invariant load straight from @globalin,
; stored to @globalout. Expected: one ld.global.nc.v4.b64 addressed [globalin].
define void @avar_double() {
; PTX-LABEL: avar_double(
; PTX:       {
; PTX-NEXT:    .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin];
; PTX-NEXT:    st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4};
; PTX-NEXT:    ret;
  %load = load <4 x double>, ptr addrspace(1) @globalin, !invariant.load !0
  store <4 x double> %load, ptr addrspace(1) @globalout
  ret void
}

; asi (direct address + offset), <32 x i8>: invariant load from @globalin + 32
; bytes, stored to @globalout + 32 bytes. The checks expect the constant offset
; folded into the address operand ([globalin+32]) of ld.global.nc.v8.b32.
define void @asi_i8() {
; PTX-LABEL: asi_i8(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
; PTX-NEXT:    st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT:    ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
  %load = load <32 x i8>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
  store <32 x i8> %load, ptr addrspace(1) %out.offset
  ret void
}

; asi (direct address + offset), <16 x i16>: invariant load from @globalin + 32
; bytes, stored to @globalout + 32 bytes. The checks expect the constant offset
; folded into the address operand ([globalin+32]) of ld.global.nc.v8.b32.
define void @asi_i16() {
; PTX-LABEL: asi_i16(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
; PTX-NEXT:    st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT:    ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
  %load = load <16 x i16>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
  store <16 x i16> %load, ptr addrspace(1) %out.offset
  ret void
}

; asi (direct address + offset), <16 x half>: invariant load from @globalin + 32
; bytes, stored to @globalout + 32 bytes. The checks expect the constant offset
; folded into the address operand ([globalin+32]) of ld.global.nc.v8.b32.
define void @asi_half() {
; PTX-LABEL: asi_half(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
; PTX-NEXT:    st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT:    ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
  %load = load <16 x half>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
  store <16 x half> %load, ptr addrspace(1) %out.offset
  ret void
}

; asi (direct address + offset), <16 x bfloat>: invariant load from
; @globalin + 32 bytes, stored to @globalout + 32 bytes. The checks expect the
; constant offset folded into the address operand ([globalin+32]).
define void @asi_bfloat() {
; PTX-LABEL: asi_bfloat(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
; PTX-NEXT:    st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT:    ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
  %load = load <16 x bfloat>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
  store <16 x bfloat> %load, ptr addrspace(1) %out.offset
  ret void
}

; asi (direct address + offset), <8 x i32>: invariant load from @globalin + 32
; bytes, stored to @globalout + 32 bytes. The checks expect the constant offset
; folded into the address operand ([globalin+32]) of ld.global.nc.v8.b32.
define void @asi_i32() {
; PTX-LABEL: asi_i32(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
; PTX-NEXT:    st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT:    ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
  %load = load <8 x i32>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
  store <8 x i32> %load, ptr addrspace(1) %out.offset
  ret void
}

; asi (direct address + offset), <4 x i64>: invariant load from @globalin + 32
; bytes, stored to @globalout + 32 bytes. The checks expect the constant offset
; folded into the address operand ([globalin+32]) of ld.global.nc.v4.b64.
define void @asi_i64() {
; PTX-LABEL: asi_i64(
; PTX:       {
; PTX-NEXT:    .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32];
; PTX-NEXT:    st.global.v4.b64 [globalout+32], {%rd1, %rd2, %rd3, %rd4};
; PTX-NEXT:    ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
  %load = load <4 x i64>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
  store <4 x i64> %load, ptr addrspace(1) %out.offset
  ret void
}

; asi (direct address + offset), <8 x float>: invariant load from @globalin + 32
; bytes, stored to @globalout + 32 bytes. The checks expect the offset folded
; into the address ([globalin+32]) and the floats moved as four 64-bit
; registers (v4.b64).
define void @asi_float() {
; PTX-LABEL: asi_float(
; PTX:       {
; PTX-NEXT:    .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32];
; PTX-NEXT:    st.global.v4.b64 [globalout+32], {%rd1, %rd2, %rd3, %rd4};
; PTX-NEXT:    ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
  %load = load <8 x float>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
  store <8 x float> %load, ptr addrspace(1) %out.offset
  ret void
}

; asi (direct address + offset), <4 x double>: invariant load from
; @globalin + 32 bytes, stored to @globalout + 32 bytes. The checks expect the
; constant offset folded into the address operand ([globalin+32]).
define void @asi_double() {
; PTX-LABEL: asi_double(
; PTX:       {
; PTX-NEXT:    .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32];
; PTX-NEXT:    st.global.v4.b64 [globalout+32], {%rd1, %rd2, %rd3, %rd4};
; PTX-NEXT:    ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
  %load = load <4 x double>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
  store <4 x double> %load, ptr addrspace(1) %out.offset
  ret void
}

; areg_64 (64-bit register address), <32 x i8>: invariant load through the %in
; parameter, stored through %out. The checks expect each pointer to be read
; with ld.param.b64 and used as a bare register address ([%rd1], [%rd2]).
define void @areg_64_i8(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_i8(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<9>;
; PTX-NEXT:    .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.param.b64 %rd1, [areg_64_i8_param_0];
; PTX-NEXT:    ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; PTX-NEXT:    ld.param.b64 %rd2, [areg_64_i8_param_1];
; PTX-NEXT:    st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT:    ret;
  %load = load <32 x i8>, ptr addrspace(1) %in, !invariant.load !0
  store <32 x i8> %load, ptr addrspace(1) %out
  ret void
}
; areg_64 (64-bit register address), <16 x i16>: invariant load through the %in
; parameter, stored through %out. The checks expect each pointer to be read
; with ld.param.b64 and used as a bare register address ([%rd1], [%rd2]).
define void @areg_64_i16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_i16(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<9>;
; PTX-NEXT:    .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.param.b64 %rd1, [areg_64_i16_param_0];
; PTX-NEXT:    ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; PTX-NEXT:    ld.param.b64 %rd2, [areg_64_i16_param_1];
; PTX-NEXT:    st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT:    ret;
  %load = load <16 x i16>, ptr addrspace(1) %in, !invariant.load !0
  store <16 x i16> %load, ptr addrspace(1) %out
  ret void
}
; areg_64 (64-bit register address), <16 x half>: invariant load through the
; %in parameter, stored through %out. The checks expect each pointer to be read
; with ld.param.b64 and used as a bare register address ([%rd1], [%rd2]).
define void @areg_64_half(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_half(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<9>;
; PTX-NEXT:    .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.param.b64 %rd1, [areg_64_half_param_0];
; PTX-NEXT:    ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; PTX-NEXT:    ld.param.b64 %rd2, [areg_64_half_param_1];
; PTX-NEXT:    st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT:    ret;
  %load = load <16 x half>, ptr addrspace(1) %in, !invariant.load !0
  store <16 x half> %load, ptr addrspace(1) %out
  ret void
}
; areg_64 (64-bit register address), <16 x bfloat>: invariant load through the
; %in parameter, stored through %out. The checks expect each pointer to be read
; with ld.param.b64 and used as a bare register address ([%rd1], [%rd2]).
define void @areg_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_bfloat(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<9>;
; PTX-NEXT:    .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.param.b64 %rd1, [areg_64_bfloat_param_0];
; PTX-NEXT:    ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; PTX-NEXT:    ld.param.b64 %rd2, [areg_64_bfloat_param_1];
; PTX-NEXT:    st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT:    ret;
  %load = load <16 x bfloat>, ptr addrspace(1) %in, !invariant.load !0
  store <16 x bfloat> %load, ptr addrspace(1) %out
  ret void
}

; areg_64 (64-bit register address), <8 x i32>: invariant load through the %in
; parameter, stored through %out. The checks expect each pointer to be read
; with ld.param.b64 and used as a bare register address ([%rd1], [%rd2]).
define void @areg_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_i32(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<9>;
; PTX-NEXT:    .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.param.b64 %rd1, [areg_64_i32_param_0];
; PTX-NEXT:    ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; PTX-NEXT:    ld.param.b64 %rd2, [areg_64_i32_param_1];
; PTX-NEXT:    st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT:    ret;
  %load = load <8 x i32>, ptr addrspace(1) %in, !invariant.load !0
  store <8 x i32> %load, ptr addrspace(1) %out
  ret void
}

; areg_64 (64-bit register address), <4 x i64>: invariant load through the %in
; parameter, stored through %out. Data and pointers share the 64-bit register
; class here, so the checks expect the addresses in %rd1 and %rd6 with the
; loaded values in %rd2-%rd5.
define void @areg_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_i64(
; PTX:       {
; PTX-NEXT:    .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.param.b64 %rd1, [areg_64_i64_param_0];
; PTX-NEXT:    ld.global.nc.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; PTX-NEXT:    ld.param.b64 %rd6, [areg_64_i64_param_1];
; PTX-NEXT:    st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; PTX-NEXT:    ret;
  %load = load <4 x i64>, ptr addrspace(1) %in, !invariant.load !0
  store <4 x i64> %load, ptr addrspace(1) %out
  ret void
}

; areg_64 (64-bit register address), <8 x float>: invariant load through the
; %in parameter, stored through %out. The checks expect the floats moved as
; four 64-bit registers (%rd2-%rd5) with the addresses in %rd1 and %rd6.
define void @areg_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_float(
; PTX:       {
; PTX-NEXT:    .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.param.b64 %rd1, [areg_64_float_param_0];
; PTX-NEXT:    ld.global.nc.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; PTX-NEXT:    ld.param.b64 %rd6, [areg_64_float_param_1];
; PTX-NEXT:    st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; PTX-NEXT:    ret;
  %load = load <8 x float>, ptr addrspace(1) %in, !invariant.load !0
  store <8 x float> %load, ptr addrspace(1) %out
  ret void
}

; areg_64 (64-bit register address), <4 x double>: invariant load through the
; %in parameter, stored through %out. The checks expect the addresses in %rd1
; and %rd6 with the loaded values in %rd2-%rd5.
define void @areg_64_double(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_double(
; PTX:       {
; PTX-NEXT:    .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.param.b64 %rd1, [areg_64_double_param_0];
; PTX-NEXT:    ld.global.nc.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; PTX-NEXT:    ld.param.b64 %rd6, [areg_64_double_param_1];
; PTX-NEXT:    st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; PTX-NEXT:    ret;
  %load = load <4 x double>, ptr addrspace(1) %in, !invariant.load !0
  store <4 x double> %load, ptr addrspace(1) %out
  ret void
}

; ari_64 (64-bit register + offset), <32 x i8>: invariant load from %in + 32
; bytes, stored to %out + 32 bytes. The checks expect the constant offset
; folded into the register address ([%rd1+32], [%rd2+32]).
define void @ari_64_i8(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_i8(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<9>;
; PTX-NEXT:    .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.param.b64 %rd1, [ari_64_i8_param_0];
; PTX-NEXT:    ld.param.b64 %rd2, [ari_64_i8_param_1];
; PTX-NEXT:    ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
; PTX-NEXT:    st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT:    ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
  %load = load <32 x i8>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
  store <32 x i8> %load, ptr addrspace(1) %out.offset
  ret void
}

; ari_64 (64-bit register + offset), <16 x i16>: invariant load from %in + 32
; bytes, stored to %out + 32 bytes. The checks expect the constant offset
; folded into the register address ([%rd1+32], [%rd2+32]).
define void @ari_64_i16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_i16(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<9>;
; PTX-NEXT:    .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.param.b64 %rd1, [ari_64_i16_param_0];
; PTX-NEXT:    ld.param.b64 %rd2, [ari_64_i16_param_1];
; PTX-NEXT:    ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
; PTX-NEXT:    st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT:    ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
  %load = load <16 x i16>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
  store <16 x i16> %load, ptr addrspace(1) %out.offset
  ret void
}

; ari_64 (64-bit register + offset), <16 x half>: invariant load from %in + 32
; bytes, stored to %out + 32 bytes. The checks expect the constant offset
; folded into the register address ([%rd1+32], [%rd2+32]).
define void @ari_64_half(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_half(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<9>;
; PTX-NEXT:    .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.param.b64 %rd1, [ari_64_half_param_0];
; PTX-NEXT:    ld.param.b64 %rd2, [ari_64_half_param_1];
; PTX-NEXT:    ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
; PTX-NEXT:    st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT:    ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
  %load = load <16 x half>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
  store <16 x half> %load, ptr addrspace(1) %out.offset
  ret void
}

; ari_64 (64-bit register + offset), <16 x bfloat>: invariant load from
; %in + 32 bytes, stored to %out + 32 bytes. The checks expect the constant
; offset folded into the register address ([%rd1+32], [%rd2+32]).
define void @ari_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_bfloat(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<9>;
; PTX-NEXT:    .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.param.b64 %rd1, [ari_64_bfloat_param_0];
; PTX-NEXT:    ld.param.b64 %rd2, [ari_64_bfloat_param_1];
; PTX-NEXT:    ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
; PTX-NEXT:    st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT:    ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
  %load = load <16 x bfloat>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
  store <16 x bfloat> %load, ptr addrspace(1) %out.offset
  ret void
}

; ari_64 (64-bit register + offset), <8 x i32>: invariant load from %in + 32
; bytes, stored to %out + 32 bytes. The checks expect the constant offset
; folded into the register address ([%rd1+32], [%rd2+32]).
define void @ari_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_i32(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<9>;
; PTX-NEXT:    .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.param.b64 %rd1, [ari_64_i32_param_0];
; PTX-NEXT:    ld.param.b64 %rd2, [ari_64_i32_param_1];
; PTX-NEXT:    ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
; PTX-NEXT:    st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT:    ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
  %load = load <8 x i32>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
  store <8 x i32> %load, ptr addrspace(1) %out.offset
  ret void
}

; ari_64 (64-bit register + offset), <4 x i64>: invariant load from %in + 32
; bytes, stored to %out + 32 bytes. The checks expect the offset folded into
; the register address ([%rd1+32], [%rd2+32]) with the data in %rd3-%rd6.
define void @ari_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_i64(
; PTX:       {
; PTX-NEXT:    .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.param.b64 %rd1, [ari_64_i64_param_0];
; PTX-NEXT:    ld.param.b64 %rd2, [ari_64_i64_param_1];
; PTX-NEXT:    ld.global.nc.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32];
; PTX-NEXT:    st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6};
; PTX-NEXT:    ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
  %load = load <4 x i64>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
  store <4 x i64> %load, ptr addrspace(1) %out.offset
  ret void
}

; ari_64 (64-bit register + offset), <8 x float>: invariant load from %in + 32
; bytes, stored to %out + 32 bytes. The checks expect the offset folded into
; the register address and the floats moved as four 64-bit registers
; (%rd3-%rd6).
define void @ari_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_float(
; PTX:       {
; PTX-NEXT:    .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.param.b64 %rd1, [ari_64_float_param_0];
; PTX-NEXT:    ld.param.b64 %rd2, [ari_64_float_param_1];
; PTX-NEXT:    ld.global.nc.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32];
; PTX-NEXT:    st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6};
; PTX-NEXT:    ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
  %load = load <8 x float>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
  store <8 x float> %load, ptr addrspace(1) %out.offset
  ret void
}

; ari_64 (64-bit register + offset), <4 x double>: invariant load from %in + 32
; bytes, stored to %out + 32 bytes. The checks expect the offset folded into
; the register address ([%rd1+32], [%rd2+32]) with the data in %rd3-%rd6.
define void @ari_64_double(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_double(
; PTX:       {
; PTX-NEXT:    .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.param.b64 %rd1, [ari_64_double_param_0];
; PTX-NEXT:    ld.param.b64 %rd2, [ari_64_double_param_1];
; PTX-NEXT:    ld.global.nc.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32];
; PTX-NEXT:    st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6};
; PTX-NEXT:    ret;
  %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
  %load = load <4 x double>, ptr addrspace(1) %in.offset, !invariant.load !0
  %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
  store <4 x double> %load, ptr addrspace(1) %out.offset
  ret void
}

; Empty metadata node; every load in this file attaches it as !invariant.load.
!0 = !{}
