99// RUN: -fcuda-is-device -emit-llvm %s -o %t.ll
1010// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
1111
12+ // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -std=c++11 \
13+ // RUN: -fcuda-is-device -emit-llvm %s -o %t.ll
14+ // RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s
15+
1216// ===----------------------------------------------------------------------===//
13- // Test AMDGPU built-in functions
17+ // Test AMDGPU builtins
1418// ===----------------------------------------------------------------------===//
1519
1620// CIR-LABEL: @_Z28test_wave_reduce_add_u32_i32Pi
1721// CIR: cir.llvm.intrinsic "amdgcn.wave.reduce.add" {{.*}} : (!u32i, !s32i) -> !u32i
1822// LLVM: define{{.*}} void @_Z28test_wave_reduce_add_u32_i32Pii(
1923// LLVM: call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %{{.*}}, i32 0)
24+ // OGCG: define{{.*}} void @_Z28test_wave_reduce_add_u32_i32Pii(
25+ // OGCG: call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %{{.*}}, i32 0)
2026__device__ void test_wave_reduce_add_u32_i32 (int * out, int in) {
2127 *out = __builtin_amdgcn_wave_reduce_add_u32 (in, 0 );
2228}
@@ -25,6 +31,8 @@ __device__ void test_wave_reduce_add_u32_i32(int* out, int in) {
2531// CIR: cir.llvm.intrinsic "amdgcn.wave.reduce.add" {{.*}} : (!u64i, !s32i) -> !u64i
2632// LLVM: define{{.*}} void @_Z28test_wave_reduce_add_u64_i64Pll(
2733// LLVM: call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 %{{.*}}, i32 0)
34+ // OGCG: define{{.*}} void @_Z28test_wave_reduce_add_u64_i64Pll(
35+ // OGCG: call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 %{{.*}}, i32 0)
2836__device__ void test_wave_reduce_add_u64_i64 (long * out, long in) {
2937 *out = __builtin_amdgcn_wave_reduce_add_u64 (in, 0 );
3038}
@@ -33,6 +41,8 @@ __device__ void test_wave_reduce_add_u64_i64(long* out, long in) {
3341// CIR: cir.llvm.intrinsic "amdgcn.wave.reduce.sub" {{.*}} : (!u32i, !s32i) -> !u32i
3442// LLVM: define{{.*}} void @_Z28test_wave_reduce_sub_u32_i32Pii(
3543// LLVM: call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %{{.*}}, i32 0)
44+ // OGCG: define{{.*}} void @_Z28test_wave_reduce_sub_u32_i32Pii(
45+ // OGCG: call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %{{.*}}, i32 0)
3646__device__ void test_wave_reduce_sub_u32_i32 (int * out, int in) {
3747 *out = __builtin_amdgcn_wave_reduce_sub_u32 (in, 0 );
3848}
@@ -41,6 +51,8 @@ __device__ void test_wave_reduce_sub_u32_i32(int* out, int in) {
4151// CIR: cir.llvm.intrinsic "amdgcn.wave.reduce.sub" {{.*}} : (!u64i, !s32i) -> !u64i
4252// LLVM: define{{.*}} void @_Z28test_wave_reduce_sub_u64_i64Pll(
4353// LLVM: call i64 @llvm.amdgcn.wave.reduce.sub.i64(i64 %{{.*}}, i32 0)
54+ // OGCG: define{{.*}} void @_Z28test_wave_reduce_sub_u64_i64Pll(
55+ // OGCG: call i64 @llvm.amdgcn.wave.reduce.sub.i64(i64 %{{.*}}, i32 0)
4456__device__ void test_wave_reduce_sub_u64_i64 (long * out, long in) {
4557 *out = __builtin_amdgcn_wave_reduce_sub_u64 (in, 0 );
4658}
@@ -49,6 +61,8 @@ __device__ void test_wave_reduce_sub_u64_i64(long* out, long in) {
4961// CIR: cir.llvm.intrinsic "amdgcn.wave.reduce.min" {{.*}} : (!s32i, !s32i) -> !s32i
5062// LLVM: define{{.*}} void @_Z29test_wave_reduce_min_i32_signPii(
5163// LLVM: call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %{{.*}}, i32 0)
64+ // OGCG: define{{.*}} void @_Z29test_wave_reduce_min_i32_signPii(
65+ // OGCG: call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %{{.*}}, i32 0)
5266__device__ void test_wave_reduce_min_i32_sign (int * out, int in) {
5367 *out = __builtin_amdgcn_wave_reduce_min_i32 (in, 0 );
5468}
@@ -57,6 +71,8 @@ __device__ void test_wave_reduce_min_i32_sign(int* out, int in) {
5771// CIR: cir.llvm.intrinsic "amdgcn.wave.reduce.umin" {{.*}} : (!u32i, !s32i) -> !u32i
5872// LLVM: define{{.*}} void @_Z31test_wave_reduce_min_u32_unsignPjj(
5973// LLVM: call i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 %{{.*}}, i32 0)
74+ // OGCG: define{{.*}} void @_Z31test_wave_reduce_min_u32_unsignPjj(
75+ // OGCG: call i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 %{{.*}}, i32 0)
6076__device__ void test_wave_reduce_min_u32_unsign (unsigned int * out, unsigned int in) {
6177 *out = __builtin_amdgcn_wave_reduce_min_u32 (in, 0 );
6278}
@@ -65,6 +81,8 @@ __device__ void test_wave_reduce_min_u32_unsign(unsigned int* out, unsigned int
6581// CIR: cir.llvm.intrinsic "amdgcn.wave.reduce.min" {{.*}} : (!s64i, !s32i) -> !s64i
6682// LLVM: define{{.*}} void @_Z29test_wave_reduce_min_i64_signPll(
6783// LLVM: call i64 @llvm.amdgcn.wave.reduce.min.i64(i64 %{{.*}}, i32 0)
84+ // OGCG: define{{.*}} void @_Z29test_wave_reduce_min_i64_signPll(
85+ // OGCG: call i64 @llvm.amdgcn.wave.reduce.min.i64(i64 %{{.*}}, i32 0)
6886__device__ void test_wave_reduce_min_i64_sign (long * out, long in) {
6987 *out = __builtin_amdgcn_wave_reduce_min_i64 (in, 0 );
7088}
@@ -73,6 +91,8 @@ __device__ void test_wave_reduce_min_i64_sign(long* out, long in) {
7391// CIR: cir.llvm.intrinsic "amdgcn.wave.reduce.umin" {{.*}} : (!u64i, !s32i) -> !u64i
7492// LLVM: define{{.*}} void @_Z31test_wave_reduce_min_u64_unsignPmm(
7593// LLVM: call i64 @llvm.amdgcn.wave.reduce.umin.i64(i64 %{{.*}}, i32 0)
94+ // OGCG: define{{.*}} void @_Z31test_wave_reduce_min_u64_unsignPmm(
95+ // OGCG: call i64 @llvm.amdgcn.wave.reduce.umin.i64(i64 %{{.*}}, i32 0)
7696__device__ void test_wave_reduce_min_u64_unsign (unsigned long * out, unsigned long in) {
7797 *out = __builtin_amdgcn_wave_reduce_min_u64 (in, 0 );
7898}
@@ -81,6 +101,8 @@ __device__ void test_wave_reduce_min_u64_unsign(unsigned long* out, unsigned lon
81101// CIR: cir.llvm.intrinsic "amdgcn.wave.reduce.max" {{.*}} : (!s32i, !s32i) -> !s32i
82102// LLVM: define{{.*}} void @_Z29test_wave_reduce_max_i32_signPii(
83103// LLVM: call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %{{.*}}, i32 0)
104+ // OGCG: define{{.*}} void @_Z29test_wave_reduce_max_i32_signPii(
105+ // OGCG: call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %{{.*}}, i32 0)
84106__device__ void test_wave_reduce_max_i32_sign (int * out, int in) {
85107 *out = __builtin_amdgcn_wave_reduce_max_i32 (in, 0 );
86108}
@@ -89,6 +111,8 @@ __device__ void test_wave_reduce_max_i32_sign(int* out, int in) {
89111// CIR: cir.llvm.intrinsic "amdgcn.wave.reduce.umax" {{.*}} : (!u32i, !s32i) -> !u32i
90112// LLVM: define{{.*}} void @_Z31test_wave_reduce_max_u32_unsignPjj(
91113// LLVM: call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %{{.*}}, i32 0)
114+ // OGCG: define{{.*}} void @_Z31test_wave_reduce_max_u32_unsignPjj(
115+ // OGCG: call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %{{.*}}, i32 0)
92116__device__ void test_wave_reduce_max_u32_unsign (unsigned int * out, unsigned int in) {
93117 *out = __builtin_amdgcn_wave_reduce_max_u32 (in, 0 );
94118}
@@ -97,6 +121,8 @@ __device__ void test_wave_reduce_max_u32_unsign(unsigned int* out, unsigned int
97121// CIR: cir.llvm.intrinsic "amdgcn.wave.reduce.max" {{.*}} : (!s64i, !s32i) -> !s64i
98122// LLVM: define{{.*}} void @_Z29test_wave_reduce_max_i64_signPll(
99123// LLVM: call i64 @llvm.amdgcn.wave.reduce.max.i64(i64 %{{.*}}, i32 0)
124+ // OGCG: define{{.*}} void @_Z29test_wave_reduce_max_i64_signPll(
125+ // OGCG: call i64 @llvm.amdgcn.wave.reduce.max.i64(i64 %{{.*}}, i32 0)
100126__device__ void test_wave_reduce_max_i64_sign (long * out, long in) {
101127 *out = __builtin_amdgcn_wave_reduce_max_i64 (in, 0 );
102128}
@@ -105,6 +131,8 @@ __device__ void test_wave_reduce_max_i64_sign(long* out, long in) {
105131// CIR: cir.llvm.intrinsic "amdgcn.wave.reduce.umax" {{.*}} : (!u64i, !s32i) -> !u64i
106132// LLVM: define{{.*}} void @_Z31test_wave_reduce_max_u64_unsignPmm(
107133// LLVM: call i64 @llvm.amdgcn.wave.reduce.umax.i64(i64 %{{.*}}, i32 0)
134+ // OGCG: define{{.*}} void @_Z31test_wave_reduce_max_u64_unsignPmm(
135+ // OGCG: call i64 @llvm.amdgcn.wave.reduce.umax.i64(i64 %{{.*}}, i32 0)
108136__device__ void test_wave_reduce_max_u64_unsign (unsigned long * out, unsigned long in) {
109137 *out = __builtin_amdgcn_wave_reduce_max_u64 (in, 0 );
110138}
@@ -113,6 +141,8 @@ __device__ void test_wave_reduce_max_u64_unsign(unsigned long* out, unsigned lon
113141// CIR: cir.llvm.intrinsic "amdgcn.wave.reduce.and" {{.*}} : (!s32i, !s32i) -> !s32i
114142// LLVM: define{{.*}} void @_Z28test_wave_reduce_and_b32_i32Pii(
115143// LLVM: call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %{{.*}}, i32 0)
144+ // OGCG: define{{.*}} void @_Z28test_wave_reduce_and_b32_i32Pii(
145+ // OGCG: call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %{{.*}}, i32 0)
116146__device__ void test_wave_reduce_and_b32_i32 (int * out, int in) {
117147 *out = __builtin_amdgcn_wave_reduce_and_b32 (in, 0 );
118148}
@@ -121,6 +151,8 @@ __device__ void test_wave_reduce_and_b32_i32(int* out, int in) {
121151// CIR: cir.llvm.intrinsic "amdgcn.wave.reduce.and" {{.*}} : (!s64i, !s32i) -> !s64i
122152// LLVM: define{{.*}} void @_Z28test_wave_reduce_and_b64_i64Pll(
123153// LLVM: call i64 @llvm.amdgcn.wave.reduce.and.i64(i64 %{{.*}}, i32 0)
154+ // OGCG: define{{.*}} void @_Z28test_wave_reduce_and_b64_i64Pll(
155+ // OGCG: call i64 @llvm.amdgcn.wave.reduce.and.i64(i64 %{{.*}}, i32 0)
124156__device__ void test_wave_reduce_and_b64_i64 (long * out, long in) {
125157 *out = __builtin_amdgcn_wave_reduce_and_b64 (in, 0 );
126158}
@@ -129,6 +161,8 @@ __device__ void test_wave_reduce_and_b64_i64(long* out, long in) {
129161// CIR: cir.llvm.intrinsic "amdgcn.wave.reduce.or" {{.*}} : (!s32i, !s32i) -> !s32i
130162// LLVM: define{{.*}} void @_Z27test_wave_reduce_or_b32_i32Pii(
131163// LLVM: call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %{{.*}}, i32 0)
164+ // OGCG: define{{.*}} void @_Z27test_wave_reduce_or_b32_i32Pii(
165+ // OGCG: call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %{{.*}}, i32 0)
132166__device__ void test_wave_reduce_or_b32_i32 (int * out, int in) {
133167 *out = __builtin_amdgcn_wave_reduce_or_b32 (in, 0 );
134168}
@@ -137,6 +171,8 @@ __device__ void test_wave_reduce_or_b32_i32(int* out, int in) {
137171// CIR: cir.llvm.intrinsic "amdgcn.wave.reduce.or" {{.*}} : (!s64i, !s32i) -> !s64i
138172// LLVM: define{{.*}} void @_Z27test_wave_reduce_or_b64_i64Pll(
139173// LLVM: call i64 @llvm.amdgcn.wave.reduce.or.i64(i64 %{{.*}}, i32 0)
174+ // OGCG: define{{.*}} void @_Z27test_wave_reduce_or_b64_i64Pll(
175+ // OGCG: call i64 @llvm.amdgcn.wave.reduce.or.i64(i64 %{{.*}}, i32 0)
140176__device__ void test_wave_reduce_or_b64_i64 (long * out, long in) {
141177 *out = __builtin_amdgcn_wave_reduce_or_b64 (in, 0 );
142178}
@@ -145,6 +181,8 @@ __device__ void test_wave_reduce_or_b64_i64(long* out, long in) {
145181// CIR: cir.llvm.intrinsic "amdgcn.wave.reduce.xor" {{.*}} : (!s32i, !s32i) -> !s32i
146182// LLVM: define{{.*}} void @_Z28test_wave_reduce_xor_b32_i32Pii(
147183// LLVM: call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %{{.*}}, i32 0)
184+ // OGCG: define{{.*}} void @_Z28test_wave_reduce_xor_b32_i32Pii(
185+ // OGCG: call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %{{.*}}, i32 0)
148186__device__ void test_wave_reduce_xor_b32_i32 (int * out, int in) {
149187 *out = __builtin_amdgcn_wave_reduce_xor_b32 (in, 0 );
150188}
@@ -153,6 +191,8 @@ __device__ void test_wave_reduce_xor_b32_i32(int* out, int in) {
153191// CIR: cir.llvm.intrinsic "amdgcn.wave.reduce.xor" {{.*}} : (!s64i, !s32i) -> !s64i
154192// LLVM: define{{.*}} void @_Z28test_wave_reduce_xor_b64_i64Pll(
155193// LLVM: call i64 @llvm.amdgcn.wave.reduce.xor.i64(i64 %{{.*}}, i32 0)
194+ // OGCG: define{{.*}} void @_Z28test_wave_reduce_xor_b64_i64Pll(
195+ // OGCG: call i64 @llvm.amdgcn.wave.reduce.xor.i64(i64 %{{.*}}, i32 0)
156196__device__ void test_wave_reduce_xor_b64_i64 (long * out, long in) {
157197 *out = __builtin_amdgcn_wave_reduce_xor_b64 (in, 0 );
158198}
@@ -162,6 +202,8 @@ __device__ void test_wave_reduce_xor_b64_i64(long* out, long in) {
162202// CIR: cir.llvm.intrinsic "amdgcn.wave.reduce.add" {{.*}} : (!u32i, !s32i) -> !u32i
163203// LLVM: define{{.*}} void @_Z38test_wave_reduce_add_u32_iterative_i32Pii(
164204// LLVM: call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %{{.*}}, i32 1)
205+ // OGCG: define{{.*}} void @_Z38test_wave_reduce_add_u32_iterative_i32Pii(
206+ // OGCG: call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %{{.*}}, i32 1)
165207__device__ void test_wave_reduce_add_u32_iterative_i32 (int * out, int in) {
166208 *out = __builtin_amdgcn_wave_reduce_add_u32 (in, 1 );
167209}
@@ -171,6 +213,8 @@ __device__ void test_wave_reduce_add_u32_iterative_i32(int* out, int in) {
171213// CIR: cir.llvm.intrinsic "amdgcn.wave.reduce.add" {{.*}} : (!u32i, !s32i) -> !u32i
172214// LLVM: define{{.*}} void @_Z32test_wave_reduce_add_u32_dpp_i32Pii(
173215// LLVM: call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %{{.*}}, i32 2)
216+ // OGCG: define{{.*}} void @_Z32test_wave_reduce_add_u32_dpp_i32Pii(
217+ // OGCG: call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %{{.*}}, i32 2)
174218__device__ void test_wave_reduce_add_u32_dpp_i32 (int * out, int in) {
175219 *out = __builtin_amdgcn_wave_reduce_add_u32 (in, 2 );
176220}
0 commit comments