HIP: Heterogeneous-computing Interface for Portability
Loading...
Searching...
No Matches
amd_device_functions.h
1/*
2Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
3
4Permission is hereby granted, free of charge, to any person obtaining a copy
5of this software and associated documentation files (the "Software"), to deal
6in the Software without restriction, including without limitation the rights
7to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8copies of the Software, and to permit persons to whom the Software is
9furnished to do so, subject to the following conditions:
10
11The above copyright notice and this permission notice shall be included in
12all copies or substantial portions of the Software.
13
14THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20THE SOFTWARE.
21*/
22
23#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
24#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
25
26#include "host_defines.h"
27#include "math_fwd.h"
28
29#if !defined(__HIPCC_RTC__)
30#include <hip/hip_runtime_api.h>
31#include <stddef.h>
32#endif // !defined(__HIPCC_RTC__)
33
34#include <hip/hip_vector_types.h>
36
37#if __HIP_CLANG_ONLY__
38extern "C" __device__ int printf(const char *fmt, ...);
39#else
40template <typename... All>
41static inline __device__ void printf(const char* format, All... all) {}
42#endif // __HIP_CLANG_ONLY__
43
44extern "C" __device__ unsigned long long __ockl_steadyctr_u64();
45
46/*
47Integer Intrinsics
48*/
49
// integer intrinsic functions: __popc __clz __ffs __brev
51__device__ static inline unsigned int __popc(unsigned int input) {
52 return __builtin_popcount(input);
53}
54__device__ static inline unsigned int __popcll(unsigned long long int input) {
55 return __builtin_popcountll(input);
56}
57
58__device__ static inline int __clz(int input) {
59 return __ockl_clz_u32((uint)input);
60}
61
62__device__ static inline int __clzll(long long int input) {
63 return __ockl_clz_u64((uint64_t)input);
64}
65
66__device__ static inline unsigned int __ffs(unsigned int input) {
67 return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
68}
69
70__device__ static inline unsigned int __ffsll(unsigned long long int input) {
71 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
72}
73
74__device__ static inline unsigned int __ffs(int input) {
75 return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
76}
77
78__device__ static inline unsigned int __ffsll(long long int input) {
79 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
80}
81
82// Given a 32/64-bit value exec mask and an integer value base (between 0 and WAVEFRONT_SIZE),
83// find the n-th (given by offset) set bit in the exec mask from the base bit, and return the bit position.
84// If not found, return -1.
85__device__ static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) {
86 uint64_t temp_mask = mask;
87 int32_t temp_offset = offset;
88
89 if (offset == 0) {
90 temp_mask &= (1 << base);
91 temp_offset = 1;
92 }
93 else if (offset < 0) {
94 temp_mask = __builtin_bitreverse64(mask);
95 base = 63 - base;
96 temp_offset = -offset;
97 }
98
99 temp_mask = temp_mask & ((~0ULL) << base);
100 if (__builtin_popcountll(temp_mask) < temp_offset)
101 return -1;
102 int32_t total = 0;
103 for (int i = 0x20; i > 0; i >>= 1) {
104 uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
105 int32_t pcnt = __builtin_popcountll(temp_mask_lo);
106 if (pcnt < temp_offset) {
107 temp_mask = temp_mask >> i;
108 temp_offset -= pcnt;
109 total += i;
110 }
111 else {
112 temp_mask = temp_mask_lo;
113 }
114 }
115 if (offset < 0)
116 return 63 - total;
117 else
118 return total;
119}
120
121__device__ static int32_t __fns32(uint64_t mask, uint32_t base, int32_t offset) {
122 uint64_t temp_mask = mask;
123 int32_t temp_offset = offset;
124 if (offset == 0) {
125 temp_mask &= (1 << base);
126 temp_offset = 1;
127 }
128 else if (offset < 0) {
129 temp_mask = __builtin_bitreverse64(mask);
130 base = 63 - base;
131 temp_offset = -offset;
132 }
133 temp_mask = temp_mask & ((~0ULL) << base);
134 if (__builtin_popcountll(temp_mask) < temp_offset)
135 return -1;
136 int32_t total = 0;
137 for (int i = 0x20; i > 0; i >>= 1) {
138 uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
139 int32_t pcnt = __builtin_popcountll(temp_mask_lo);
140 if (pcnt < temp_offset) {
141 temp_mask = temp_mask >> i;
142 temp_offset -= pcnt;
143 total += i;
144 }
145 else {
146 temp_mask = temp_mask_lo;
147 }
148 }
149 if (offset < 0)
150 return 63 - total;
151 else
152 return total;
153}
154__device__ static inline unsigned int __brev(unsigned int input) {
155 return __builtin_bitreverse32(input);
156}
157
158__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
159 return __builtin_bitreverse64(input);
160}
161
162__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
163 return input == 0 ? -1 : __builtin_ctzl(input);
164}
165
166__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
167 uint32_t offset = src1 & 31;
168 uint32_t width = src2 & 31;
169 return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
170}
171
172__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
173 uint64_t offset = src1 & 63;
174 uint64_t width = src2 & 63;
175 return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
176}
177
178__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
179 uint32_t offset = src2 & 31;
180 uint32_t width = src3 & 31;
181 uint32_t mask = (1 << width) - 1;
182 return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
183}
184
185__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
186 uint64_t offset = src2 & 63;
187 uint64_t width = src3 & 63;
188 uint64_t mask = (1ULL << width) - 1;
189 return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
190}
191
192__device__ inline unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)
193{
194 uint32_t mask_shift = shift & 31;
195 return mask_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - mask_shift);
196}
197
198__device__ inline unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)
199{
200 uint32_t min_shift = shift >= 32 ? 32 : shift;
201 return min_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - min_shift);
202}
203
204__device__ inline unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)
205{
206 return __builtin_amdgcn_alignbit(hi, lo, shift);
207}
208
209__device__ inline unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift)
210{
211 return shift >= 32 ? hi : __builtin_amdgcn_alignbit(hi, lo, shift);
212}
213
214__device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
215__device__ static unsigned int __hadd(int x, int y);
216__device__ static int __mul24(int x, int y);
217__device__ static long long int __mul64hi(long long int x, long long int y);
218__device__ static int __mulhi(int x, int y);
219__device__ static int __rhadd(int x, int y);
220__device__ static unsigned int __sad(int x, int y,unsigned int z);
221__device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
222__device__ static int __umul24(unsigned int x, unsigned int y);
223__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
224__device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
225__device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
226__device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
227
229 union {
230 unsigned char c[4];
231 unsigned int ui;
232 };
233} __attribute__((aligned(4)));
234
236 union {
237 unsigned int ui[2];
238 unsigned char c[8];
239 };
240} __attribute__((aligned(8)));
241
242__device__
243static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
244 struct uchar2Holder cHoldVal;
245 struct ucharHolder cHoldKey;
246 cHoldKey.ui = s;
247 cHoldVal.ui[0] = x;
248 cHoldVal.ui[1] = y;
249 unsigned int result;
250 result = cHoldVal.c[cHoldKey.c[0] & 0x07];
251 result += (cHoldVal.c[(cHoldKey.c[0] & 0x70) >> 4] << 8);
252 result += (cHoldVal.c[cHoldKey.c[1] & 0x07] << 16);
253 result += (cHoldVal.c[(cHoldKey.c[1] & 0x70) >> 4] << 24);
254 return result;
255}
256
257__device__ static inline unsigned int __hadd(int x, int y) {
258 int z = x + y;
259 int sign = z & 0x8000000;
260 int value = z & 0x7FFFFFFF;
261 return ((value) >> 1 || sign);
262}
263
264__device__ static inline int __mul24(int x, int y) {
265 return __ockl_mul24_i32(x, y);
266}
267
268__device__ static inline long long __mul64hi(long long int x, long long int y) {
269 ulong x0 = (ulong)x & 0xffffffffUL;
270 long x1 = x >> 32;
271 ulong y0 = (ulong)y & 0xffffffffUL;
272 long y1 = y >> 32;
273 ulong z0 = x0*y0;
274 long t = x1*y0 + (z0 >> 32);
275 long z1 = t & 0xffffffffL;
276 long z2 = t >> 32;
277 z1 = x0*y1 + z1;
278 return x1*y1 + z2 + (z1 >> 32);
279}
280
281__device__ static inline int __mulhi(int x, int y) {
282 return __ockl_mul_hi_i32(x, y);
283}
284
285__device__ static inline int __rhadd(int x, int y) {
286 int z = x + y + 1;
287 int sign = z & 0x8000000;
288 int value = z & 0x7FFFFFFF;
289 return ((value) >> 1 || sign);
290}
291__device__ static inline unsigned int __sad(int x, int y, unsigned int z) {
292 return x > y ? x - y + z : y - x + z;
293}
294__device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
295 return (x + y) >> 1;
296}
297__device__ static inline int __umul24(unsigned int x, unsigned int y) {
298 return __ockl_mul24_u32(x, y);
299}
300
301__device__
302static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
303 ulong x0 = x & 0xffffffffUL;
304 ulong x1 = x >> 32;
305 ulong y0 = y & 0xffffffffUL;
306 ulong y1 = y >> 32;
307 ulong z0 = x0*y0;
308 ulong t = x1*y0 + (z0 >> 32);
309 ulong z1 = t & 0xffffffffUL;
310 ulong z2 = t >> 32;
311 z1 = x0*y1 + z1;
312 return x1*y1 + z2 + (z1 >> 32);
313}
314
315__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
316 return __ockl_mul_hi_u32(x, y);
317}
318__device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
319 return (x + y + 1) >> 1;
320}
321__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
322 return __ockl_sadd_u32(x, y, z);
323}
324
325__device__ static inline unsigned int __lane_id() {
326 return __builtin_amdgcn_mbcnt_hi(
327 -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
328}
329
330__device__
331static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_lo(x,y);};
332
333__device__
334static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_hi(x,y);};
335
336/*
337HIP specific device functions
338*/
339
340#if !defined(__HIPCC_RTC__)
341#include "amd_warp_functions.h"
342#endif
343
344#define MASK1 0x00ff00ff
345#define MASK2 0xff00ff00
346
347__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
348 char4 out;
349 unsigned one1 = in1.w & MASK1;
350 unsigned one2 = in2.w & MASK1;
351 out.w = (one1 + one2) & MASK1;
352 one1 = in1.w & MASK2;
353 one2 = in2.w & MASK2;
354 out.w = out.w | ((one1 + one2) & MASK2);
355 return out;
356}
357
358__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
359 char4 out;
360 unsigned one1 = in1.w & MASK1;
361 unsigned one2 = in2.w & MASK1;
362 out.w = (one1 - one2) & MASK1;
363 one1 = in1.w & MASK2;
364 one2 = in2.w & MASK2;
365 out.w = out.w | ((one1 - one2) & MASK2);
366 return out;
367}
368
369__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
370 char4 out;
371 unsigned one1 = in1.w & MASK1;
372 unsigned one2 = in2.w & MASK1;
373 out.w = (one1 * one2) & MASK1;
374 one1 = in1.w & MASK2;
375 one2 = in2.w & MASK2;
376 out.w = out.w | ((one1 * one2) & MASK2);
377 return out;
378}
379
380__device__ static inline float __double2float_rd(double x) {
381 return __ocml_cvtrtn_f32_f64(x);
382}
383__device__ static inline float __double2float_rn(double x) { return x; }
384__device__ static inline float __double2float_ru(double x) {
385 return __ocml_cvtrtp_f32_f64(x);
386}
387__device__ static inline float __double2float_rz(double x) {
388 return __ocml_cvtrtz_f32_f64(x);
389}
390
391__device__ static inline int __double2hiint(double x) {
392 static_assert(sizeof(double) == 2 * sizeof(int), "");
393
394 int tmp[2];
395 __builtin_memcpy(tmp, &x, sizeof(tmp));
396
397 return tmp[1];
398}
399__device__ static inline int __double2loint(double x) {
400 static_assert(sizeof(double) == 2 * sizeof(int), "");
401
402 int tmp[2];
403 __builtin_memcpy(tmp, &x, sizeof(tmp));
404
405 return tmp[0];
406}
407
408__device__ static inline int __double2int_rd(double x) { return (int)__ocml_floor_f64(x); }
409__device__ static inline int __double2int_rn(double x) { return (int)__ocml_rint_f64(x); }
410__device__ static inline int __double2int_ru(double x) { return (int)__ocml_ceil_f64(x); }
411__device__ static inline int __double2int_rz(double x) { return (int)x; }
412
413__device__ static inline long long int __double2ll_rd(double x) {
414 return (long long)__ocml_floor_f64(x);
415}
416__device__ static inline long long int __double2ll_rn(double x) {
417 return (long long)__ocml_rint_f64(x);
418}
419__device__ static inline long long int __double2ll_ru(double x) {
420 return (long long)__ocml_ceil_f64(x);
421}
422__device__ static inline long long int __double2ll_rz(double x) { return (long long)x; }
423
424__device__ static inline unsigned int __double2uint_rd(double x) {
425 return (unsigned int)__ocml_floor_f64(x);
426}
427__device__ static inline unsigned int __double2uint_rn(double x) {
428 return (unsigned int)__ocml_rint_f64(x);
429}
430__device__ static inline unsigned int __double2uint_ru(double x) {
431 return (unsigned int)__ocml_ceil_f64(x);
432}
433__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }
434
435__device__ static inline unsigned long long int __double2ull_rd(double x) {
436 return (unsigned long long int)__ocml_floor_f64(x);
437}
438__device__ static inline unsigned long long int __double2ull_rn(double x) {
439 return (unsigned long long int)__ocml_rint_f64(x);
440}
441__device__ static inline unsigned long long int __double2ull_ru(double x) {
442 return (unsigned long long int)__ocml_ceil_f64(x);
443}
444__device__ static inline unsigned long long int __double2ull_rz(double x) {
445 return (unsigned long long int)x;
446}
447#if defined(__clang__)
448#pragma clang diagnostic push
449#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
450#endif
451__device__ static inline long long int __double_as_longlong(double x) {
452 static_assert(sizeof(long long) == sizeof(double), "");
453
454 long long tmp;
455 __builtin_memcpy(&tmp, &x, sizeof(tmp));
456
457 return tmp;
458}
459#if defined(__clang__)
460#pragma clang diagnostic pop
461#endif
462
463/*
__device__ unsigned short __float2half_rn(float x);
__device__ float __half2float(unsigned short);

The above device function declarations are not valid.
Use
__device__ __half __float2half_rn(float x);
__device__ float __half2float(__half);
from hip_fp16.h

CUDA implements half as unsigned short, whereas HIP does not.

476
477__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
478__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
479__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
480__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }
481
482__device__ static inline long long int __float2ll_rd(float x) {
483 return (long long int)__ocml_floor_f32(x);
484}
485__device__ static inline long long int __float2ll_rn(float x) {
486 return (long long int)__ocml_rint_f32(x);
487}
488__device__ static inline long long int __float2ll_ru(float x) {
489 return (long long int)__ocml_ceil_f32(x);
490}
491__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }
492
493__device__ static inline unsigned int __float2uint_rd(float x) {
494 return (unsigned int)__ocml_floor_f32(x);
495}
496__device__ static inline unsigned int __float2uint_rn(float x) {
497 return (unsigned int)__ocml_rint_f32(x);
498}
499__device__ static inline unsigned int __float2uint_ru(float x) {
500 return (unsigned int)__ocml_ceil_f32(x);
501}
502__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }
503
504__device__ static inline unsigned long long int __float2ull_rd(float x) {
505 return (unsigned long long int)__ocml_floor_f32(x);
506}
507__device__ static inline unsigned long long int __float2ull_rn(float x) {
508 return (unsigned long long int)__ocml_rint_f32(x);
509}
510__device__ static inline unsigned long long int __float2ull_ru(float x) {
511 return (unsigned long long int)__ocml_ceil_f32(x);
512}
513__device__ static inline unsigned long long int __float2ull_rz(float x) {
514 return (unsigned long long int)x;
515}
516
517__device__ static inline int __float_as_int(float x) {
518 static_assert(sizeof(int) == sizeof(float), "");
519
520 int tmp;
521 __builtin_memcpy(&tmp, &x, sizeof(tmp));
522
523 return tmp;
524}
525
526__device__ static inline unsigned int __float_as_uint(float x) {
527 static_assert(sizeof(unsigned int) == sizeof(float), "");
528
529 unsigned int tmp;
530 __builtin_memcpy(&tmp, &x, sizeof(tmp));
531
532 return tmp;
533}
534
535__device__ static inline double __hiloint2double(int hi, int lo) {
536 static_assert(sizeof(double) == sizeof(uint64_t), "");
537
538 uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
539 double tmp1;
540 __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
541
542 return tmp1;
543}
544
545__device__ static inline double __int2double_rn(int x) { return (double)x; }
546
547__device__ static inline float __int2float_rd(int x) {
548 return __ocml_cvtrtn_f32_s32(x);
549}
550__device__ static inline float __int2float_rn(int x) { return (float)x; }
551__device__ static inline float __int2float_ru(int x) {
552 return __ocml_cvtrtp_f32_s32(x);
553}
554__device__ static inline float __int2float_rz(int x) {
555 return __ocml_cvtrtz_f32_s32(x);
556}
557
558__device__ static inline float __int_as_float(int x) {
559 static_assert(sizeof(float) == sizeof(int), "");
560
561 float tmp;
562 __builtin_memcpy(&tmp, &x, sizeof(tmp));
563
564 return tmp;
565}
566
567__device__ static inline double __ll2double_rd(long long int x) {
568 return __ocml_cvtrtn_f64_s64(x);
569}
570__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
571__device__ static inline double __ll2double_ru(long long int x) {
572 return __ocml_cvtrtp_f64_s64(x);
573}
574__device__ static inline double __ll2double_rz(long long int x) {
575 return __ocml_cvtrtz_f64_s64(x);
576}
577
578__device__ static inline float __ll2float_rd(long long int x) {
579 return __ocml_cvtrtn_f32_s64(x);
580}
581__device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
582__device__ static inline float __ll2float_ru(long long int x) {
583 return __ocml_cvtrtp_f32_s64(x);
584}
585__device__ static inline float __ll2float_rz(long long int x) {
586 return __ocml_cvtrtz_f32_s64(x);
587}
588
589__device__ static inline double __longlong_as_double(long long int x) {
590 static_assert(sizeof(double) == sizeof(long long), "");
591
592 double tmp;
593 __builtin_memcpy(&tmp, &x, sizeof(tmp));
594
595 return tmp;
596}
597
598__device__ static inline double __uint2double_rn(unsigned int x) { return (double)x; }
599
600__device__ static inline float __uint2float_rd(unsigned int x) {
601 return __ocml_cvtrtn_f32_u32(x);
602}
603__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
604__device__ static inline float __uint2float_ru(unsigned int x) {
605 return __ocml_cvtrtp_f32_u32(x);
606}
607__device__ static inline float __uint2float_rz(unsigned int x) {
608 return __ocml_cvtrtz_f32_u32(x);
609}
610
611__device__ static inline float __uint_as_float(unsigned int x) {
612 static_assert(sizeof(float) == sizeof(unsigned int), "");
613
614 float tmp;
615 __builtin_memcpy(&tmp, &x, sizeof(tmp));
616
617 return tmp;
618}
619
620__device__ static inline double __ull2double_rd(unsigned long long int x) {
621 return __ocml_cvtrtn_f64_u64(x);
622}
623__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
624__device__ static inline double __ull2double_ru(unsigned long long int x) {
625 return __ocml_cvtrtp_f64_u64(x);
626}
627__device__ static inline double __ull2double_rz(unsigned long long int x) {
628 return __ocml_cvtrtz_f64_u64(x);
629}
630
631__device__ static inline float __ull2float_rd(unsigned long long int x) {
632 return __ocml_cvtrtn_f32_u64(x);
633}
634__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
635__device__ static inline float __ull2float_ru(unsigned long long int x) {
636 return __ocml_cvtrtp_f32_u64(x);
637}
638__device__ static inline float __ull2float_rz(unsigned long long int x) {
639 return __ocml_cvtrtz_f32_u64(x);
640}
641
642#if __HIP_CLANG_ONLY__
643
644// Clock functions
645__device__ long long int __clock64();
646__device__ long long int __clock();
647__device__ long long int clock64();
648__device__ long long int clock();
649__device__ long long int wall_clock64();
650// hip.amdgcn.bc - named sync
651__device__ void __named_sync();
652
653#ifdef __HIP_DEVICE_COMPILE__
654
655// Clock function to return GPU core cycle count.
656// GPU can change its core clock frequency at runtime. The maximum frequency can be queried
657// through hipDeviceAttributeClockRate attribute.
658__device__
659inline __attribute((always_inline))
660long long int __clock64() {
661#if __has_builtin(__builtin_amdgcn_s_memtime)
662 // Exists on gfx8, gfx9, gfx10.1, gfx10.2, gfx10.3
663 return (long long int) __builtin_amdgcn_s_memtime();
664#else
665 // Subject to change when better solution available
666 return (long long int) __builtin_readcyclecounter();
667#endif
668}
669
670__device__
671inline __attribute((always_inline))
672long long int __clock() { return __clock64(); }
673
674// Clock function to return wall clock count at a constant frequency that can be queried
675// through hipDeviceAttributeWallClockRate attribute.
676__device__
677inline __attribute__((always_inline))
678long long int wall_clock64() {
679 return (long long int) __ockl_steadyctr_u64();
680}
681
682__device__
683inline __attribute__((always_inline))
684long long int clock64() { return __clock64(); }
685
686__device__
687inline __attribute__((always_inline))
688long long int clock() { return __clock(); }
689
690// hip.amdgcn.bc - named sync
691__device__
692inline
693void __named_sync() { __builtin_amdgcn_s_barrier(); }
694
695#endif // __HIP_DEVICE_COMPILE__
696
697// warp vote function __all __any __ballot
698__device__
699inline
700int __all(int predicate) {
701 return __ockl_wfall_i32(predicate);
702}
703
704__device__
705inline
706int __any(int predicate) {
707 return __ockl_wfany_i32(predicate);
708}
709
710// XXX from llvm/include/llvm/IR/InstrTypes.h
711#define ICMP_NE 33
712
713__device__
714inline
715unsigned long long int __ballot(int predicate) {
716 return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
717}
718
719__device__
720inline
721unsigned long long int __ballot64(int predicate) {
722 return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
723}
724
725// hip.amdgcn.bc - lanemask
726__device__
727inline
728uint64_t __lanemask_gt()
729{
730 uint32_t lane = __ockl_lane_u32();
731 if (lane == 63)
732 return 0;
733 uint64_t ballot = __ballot64(1);
734 uint64_t mask = (~((uint64_t)0)) << (lane + 1);
735 return mask & ballot;
736}
737
738__device__
739inline
740uint64_t __lanemask_lt()
741{
742 uint32_t lane = __ockl_lane_u32();
743 int64_t ballot = __ballot64(1);
744 uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
745 return mask & ballot;
746}
747
748__device__
749inline
750uint64_t __lanemask_eq()
751{
752 uint32_t lane = __ockl_lane_u32();
753 int64_t mask = ((uint64_t)1 << lane);
754 return mask;
755}
756
757
758__device__ inline void* __local_to_generic(void* p) { return p; }
759
760#ifdef __HIP_DEVICE_COMPILE__
761__device__
762inline
763void* __get_dynamicgroupbaseptr()
764{
765 // Get group segment base pointer.
766 return (char*)__local_to_generic((void*)__to_local(__builtin_amdgcn_groupstaticsize()));
767}
768#else
769__device__
770void* __get_dynamicgroupbaseptr();
771#endif // __HIP_DEVICE_COMPILE__
772
773__device__
774inline
775void *__amdgcn_get_dynamicgroupbaseptr() {
776 return __get_dynamicgroupbaseptr();
777}
778
779// Memory Fence Functions
780__device__
781inline
782static void __threadfence()
783{
784 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
785}
786
787__device__
788inline
789static void __threadfence_block()
790{
791 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
792}
793
794__device__
795inline
796static void __threadfence_system()
797{
798 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
799}
800
801// abort
802__device__
803inline
804__attribute__((weak))
805void abort() {
806 return __builtin_trap();
807}
808
809// The noinline attribute helps encapsulate the printf expansion,
810// which otherwise has a performance impact just by increasing the
811// size of the calling function. Additionally, the weak attribute
812// allows the function to exist as a global although its definition is
813// included in every compilation unit.
814#if defined(_WIN32) || defined(_WIN64)
815extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
816void _wassert(const wchar_t *_msg, const wchar_t *_file, unsigned _line) {
817 // FIXME: Need `wchar_t` support to generate assertion message.
818 __builtin_trap();
819}
820#else /* defined(_WIN32) || defined(_WIN64) */
821extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
822void __assert_fail(const char *assertion,
823 const char *file,
824 unsigned int line,
825 const char *function)
826{
827 const char fmt[] = "%s:%u: %s: Device-side assertion `%s' failed.\n";
828
829 // strlen is not available as a built-in yet, so we create our own
830 // loop in a macro. With a string literal argument, the compiler
831 // usually manages to replace the loop with a constant.
832 //
833 // The macro does not check for null pointer, since all the string
834 // arguments are defined to be constant literals when called from
835 // the assert() macro.
836 //
837 // NOTE: The loop below includes the null terminator in the length
838 // as required by append_string_n().
839#define __hip_get_string_length(LEN, STR) \
840 do { \
841 const char *tmp = STR; \
842 while (*tmp++); \
843 LEN = tmp - STR; \
844 } while (0)
845
846 auto msg = __ockl_fprintf_stderr_begin();
847 int len = 0;
848 __hip_get_string_length(len, fmt);
849 msg = __ockl_fprintf_append_string_n(msg, fmt, len, 0);
850 __hip_get_string_length(len, file);
851 msg = __ockl_fprintf_append_string_n(msg, file, len, 0);
852 msg = __ockl_fprintf_append_args(msg, 1, line, 0, 0, 0, 0, 0, 0, 0);
853 __hip_get_string_length(len, function);
854 msg = __ockl_fprintf_append_string_n(msg, function, len, 0);
855 __hip_get_string_length(len, assertion);
856 __ockl_fprintf_append_string_n(msg, assertion, len, /* is_last = */ 1);
857
858#undef __hip_get_string_length
859
860 __builtin_trap();
861}
862
863extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
864void __assertfail()
865{
866 // ignore all the args for now.
867 __builtin_trap();
868}
869#endif /* defined(_WIN32) || defined(_WIN64) */
870
871__device__ inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
872 if (flags) {
873 __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
874 __builtin_amdgcn_s_barrier();
875 __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
876 } else {
877 __builtin_amdgcn_s_barrier();
878 }
879}
880
881__device__
882inline
883static void __barrier(int n)
884{
885 __work_group_barrier((__cl_mem_fence_flags)n);
886}
887
888__device__
889inline
890__attribute__((convergent))
891void __syncthreads()
892{
893 __barrier(__CLK_LOCAL_MEM_FENCE);
894}
895
896__device__
897inline
898__attribute__((convergent))
899int __syncthreads_count(int predicate)
900{
901 return __ockl_wgred_add_i32(!!predicate);
902}
903
904__device__
905inline
906__attribute__((convergent))
907int __syncthreads_and(int predicate)
908{
909 return __ockl_wgred_and_i32(!!predicate);
910}
911
912__device__
913inline
914__attribute__((convergent))
915int __syncthreads_or(int predicate)
916{
917 return __ockl_wgred_or_i32(!!predicate);
918}
919
920// hip.amdgcn.bc - device routine
921/*
922 HW_ID Register bit structure for RDNA2 & RDNA3
923 WAVE_ID 4:0 Wave id within the SIMD.
924 SIMD_ID 9:8 SIMD_ID within the WGP: [0] = row, [1] = column.
925 WGP_ID 13:10 Physical WGP ID.
926 SA_ID 16 Shader Array ID
927 SE_ID 20:18 Shader Engine the wave is assigned to for gfx11
928 SE_ID 19:18 Shader Engine the wave is assigned to for gfx10
929 DP_RATE 31:29 Number of double-precision float units per SIMD
930
931 HW_ID Register bit structure for GCN and CDNA
932 WAVE_ID 3:0 Wave buffer slot number. 0-9.
933 SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
934 PIPE_ID 7:6 Pipeline from which the wave was dispatched.
935 CU_ID 11:8 Compute Unit the wave is assigned to.
936 SH_ID 12 Shader Array (within an SE) the wave is assigned to.
937 SE_ID 15:13 Shader Engine the wave is assigned to for gfx908, gfx90a, gfx940
938 14:13 Shader Engine the wave is assigned to for Vega.
939 TG_ID 19:16 Thread-group ID
940 VM_ID 23:20 Virtual Memory ID
941 QUEUE_ID 26:24 Queue from which this wave was dispatched.
942 STATE_ID 29:27 State ID (graphics only, not compute).
943 ME_ID 31:30 Micro-engine ID.
944
945 XCC_ID Register bit structure for gfx940
946 XCC_ID 3:0 XCC the wave is assigned to.
947 */
948
949#if (defined (__GFX10__) || defined (__GFX11__))
950 #define HW_ID 23
951#else
952 #define HW_ID 4
953#endif
954
955#if (defined(__GFX10__) || defined(__GFX11__))
956 #define HW_ID_WGP_ID_SIZE 4
957 #define HW_ID_WGP_ID_OFFSET 10
958#else
959 #define HW_ID_CU_ID_SIZE 4
960 #define HW_ID_CU_ID_OFFSET 8
961#endif
962
963#if (defined(__gfx908__) || defined(__gfx90a__) || \
964 defined(__GFX11__))
965 #define HW_ID_SE_ID_SIZE 3
966#else //4 SEs/XCC for gfx940
967 #define HW_ID_SE_ID_SIZE 2
968#endif
969#if (defined(__GFX10__) || defined(__GFX11__))
970 #define HW_ID_SE_ID_OFFSET 18
971 #define HW_ID_SA_ID_OFFSET 16
972 #define HW_ID_SA_ID_SIZE 1
973#else
974 #define HW_ID_SE_ID_OFFSET 13
975#endif
976
977#if (defined(__gfx940__))
978 #define XCC_ID 20
979 #define XCC_ID_XCC_ID_SIZE 4
980 #define XCC_ID_XCC_ID_OFFSET 0
981#endif
982
983#if (!defined(__HIP_NO_IMAGE_SUPPORT) && \
984 (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)))
985 #define __HIP_NO_IMAGE_SUPPORT 1
986#endif
987
988/*
989 Encoding of parameter bitmask
990 HW_ID 5:0 HW_ID
991 OFFSET 10:6 Range: 0..31
992 SIZE 15:11 Range: 1..32
993 */
994
995#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
996
997/*
998 __smid returns the wave's assigned Compute Unit and Shader Engine.
999 The Compute Unit, CU_ID returned in bits 3:0, and Shader Engine, SE_ID in bits 5:4.
1000 Note: the results vary over time.
1001 SZ minus 1 since SIZE is 1-based.
1002*/
1003__device__
1004inline
1005unsigned __smid(void)
1006{
1007 unsigned se_id = __builtin_amdgcn_s_getreg(
1008 GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
1009 #if (defined(__GFX10__) || defined(__GFX11__))
1010 unsigned wgp_id = __builtin_amdgcn_s_getreg(
1011 GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID));
1012 unsigned sa_id = __builtin_amdgcn_s_getreg(
1013 GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID));
1014 #else
1015 #if defined(__gfx940__)
1016 unsigned xcc_id = __builtin_amdgcn_s_getreg(
1017 GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID));
1018 #endif
1019 unsigned cu_id = __builtin_amdgcn_s_getreg(
1020 GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
1021 #endif
1022 #if (defined(__GFX10__) || defined(__GFX11__))
1023 unsigned temp = se_id;
1024 temp = (temp << HW_ID_SA_ID_SIZE) | sa_id;
1025 temp = (temp << HW_ID_WGP_ID_SIZE) | wgp_id;
1026 return temp;
1027 //TODO : CU Mode impl
1028 #elif defined(__gfx940__)
1029 unsigned temp = xcc_id;
1030 temp = (temp << HW_ID_SE_ID_SIZE) | se_id;
1031 temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
1032 return temp;
1033 #else
1034 return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
1035 #endif
1036}
1037
1042#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
1043#define HIP_DYNAMIC_SHARED_ATTRIBUTE
1044
1045#endif //defined(__clang__) && defined(__HIP__)
1046
1047
1048// loop unrolling
1049static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
1050 auto dstPtr = static_cast<unsigned char*>(dst);
1051 auto srcPtr = static_cast<const unsigned char*>(src);
1052
1053 while (size >= 4u) {
1054 dstPtr[0] = srcPtr[0];
1055 dstPtr[1] = srcPtr[1];
1056 dstPtr[2] = srcPtr[2];
1057 dstPtr[3] = srcPtr[3];
1058
1059 size -= 4u;
1060 srcPtr += 4u;
1061 dstPtr += 4u;
1062 }
1063 switch (size) {
1064 case 3:
1065 dstPtr[2] = srcPtr[2];
1066 case 2:
1067 dstPtr[1] = srcPtr[1];
1068 case 1:
1069 dstPtr[0] = srcPtr[0];
1070 }
1071
1072 return dst;
1073}
1074
1075static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
1076 auto dstPtr = static_cast<unsigned char*>(dst);
1077
1078 while (size >= 4u) {
1079 dstPtr[0] = val;
1080 dstPtr[1] = val;
1081 dstPtr[2] = val;
1082 dstPtr[3] = val;
1083
1084 size -= 4u;
1085 dstPtr += 4u;
1086 }
1087 switch (size) {
1088 case 3:
1089 dstPtr[2] = val;
1090 case 2:
1091 dstPtr[1] = val;
1092 case 1:
1093 dstPtr[0] = val;
1094 }
1095
1096 return dst;
1097}
1098#ifndef __OPENMP_AMDGCN__
1099static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
1100 return __hip_hc_memcpy(dst, src, size);
1101}
1102
1103static inline __device__ void* memset(void* ptr, int val, size_t size) {
1104 unsigned char val8 = static_cast<unsigned char>(val);
1105 return __hip_hc_memset(ptr, val8, size);
1106}
1107#endif // !__OPENMP_AMDGCN__
1108#endif
Contains declarations for types and functions in device library. Uses int64_t and uint64_t instead of...
Definition amd_device_functions.h:228
Definition amd_device_functions.h:235
Definition amd_hip_vector_types.h:1623