// Fragment (reduce.h:21): member `x` of a reduction functor — an in-place
// combine on a volatile (shared-memory) destination, used by the __syncwarp
// variants of warp_reduce/warp_reduce2 below. The body (reduce.h:22 ff.) is
// not visible in this extraction — TODO confirm semantics (likely a = op(a, b)).
21 void x(
volatile T& a, T b)
const
// Fragment (reduce.h:43): the second functor's `x` member with the same
// volatile in-place combine contract (cf. the cross-reference list at the end
// of this chunk). Body (reduce.h:44 ff.) is missing from this extraction —
// TODO confirm against the full source.
43 void x(
volatile T& a, T b)
const
// --- warp_reduce (reduce.h:49..69; extraction-garbled Doxygen listing) ---
// Final intra-warp stage of a shared-memory tree reduction over sd[0..B);
// the reduce kernel below calls it only for threads t < 32.
// NOTE(review): the signature at reduce.h:51 — per the cross-reference list,
//   __device__ void warp_reduce(volatile T* sd, unsigned int t, Op op)
// — and the preprocessor guard that selects between the two variants below
// were dropped by the extraction; do not edit the logic without the full file.
49template <
class T,
unsigned int B,
class Op>
// Variant A (reduce.h:56-61): legacy implicitly-synchronized warp reduction.
// Relies on warp-lockstep execution (pre-Volta idiom); the volatile sd forces
// each intermediate store/load to shared memory.
56 if (B >= 64) sd[t] = op(sd[t], sd[t + 32]);
57 if (B >= 32) sd[t] = op(sd[t], sd[t + 16]);
58 if (B >= 16) sd[t] = op(sd[t], sd[t + 8 ]);
59 if (B >= 8) sd[t] = op(sd[t], sd[t + 4 ]);
60 if (B >= 4) sd[t] = op(sd[t], sd[t + 2 ]);
61 if (B >= 2) sd[t] = op(sd[t], sd[t + 1 ]);
// Variant B (reduce.h:64-69): Volta+-safe form for independent thread
// scheduling. Each halving step snapshots the partner value into `var`,
// __syncwarp()s so all lanes have read before any lane writes, combines in
// place via op.x(dst, src), then __syncwarp()s again before the next step.
// NOTE(review): the declaration of `var` (presumably `T var;` near
// reduce.h:63) is missing from this extraction — confirm.
64 if (B >= 64) { var=sd[t+32];__syncwarp(); op.x(sd[t],var);__syncwarp(); }
65 if (B >= 32) { var=sd[t+16];__syncwarp(); op.x(sd[t],var);__syncwarp(); }
66 if (B >= 16) { var=sd[t+8 ];__syncwarp(); op.x(sd[t],var);__syncwarp(); }
67 if (B >= 8) { var=sd[t+4 ];__syncwarp(); op.x(sd[t],var);__syncwarp(); }
68 if (B >= 4) { var=sd[t+2 ];__syncwarp(); op.x(sd[t],var);__syncwarp(); }
69 if (B >= 2) { var=sd[t+1 ];__syncwarp(); op.x(sd[t],var);__syncwarp(); }
// --- warp_reduce2 (reduce.h:74..94; extraction-garbled Doxygen listing) ---
// Intra-warp stage for HN simultaneous reductions held in sd[HN][B]; the
// multi-row companion of warp_reduce, called by reduce2 for threads t < 32.
// NOTE(review): the cross-reference list shows the real qualifier —
//   __device__ void warp_reduce2(volatile T(*sd)[B], unsigned int t, Op op)
// — the `__device__`, the opening brace, and the guard selecting between the
// two variants were lost by the extraction.
74template <
class T,
unsigned int HN,
unsigned int B,
class Op>
76inline void warp_reduce2(
volatile T (*sd)[B],
unsigned int t, Op op)
// Variant A (reduce.h:81-86): legacy implicitly-synchronized warp reduction,
// unrolled over the HN rows at each halving step (pre-Volta lockstep idiom).
81 if (B >= 64) _Pragma(
"unroll")
for (
int j = 0; j < HN; ++j) sd[j][t] = op(sd[j][t], sd[j][t + 32]);
82 if (B >= 32) _Pragma(
"unroll")
for (
int j = 0; j < HN; ++j) sd[j][t] = op(sd[j][t], sd[j][t + 16]);
83 if (B >= 16) _Pragma(
"unroll")
for (
int j = 0; j < HN; ++j) sd[j][t] = op(sd[j][t], sd[j][t + 8 ]);
84 if (B >= 8) _Pragma(
"unroll")
for (
int j = 0; j < HN; ++j) sd[j][t] = op(sd[j][t], sd[j][t + 4 ]);
85 if (B >= 4) _Pragma(
"unroll")
for (
int j = 0; j < HN; ++j) sd[j][t] = op(sd[j][t], sd[j][t + 2 ]);
86 if (B >= 2) _Pragma(
"unroll")
for (
int j = 0; j < HN; ++j) sd[j][t] = op(sd[j][t], sd[j][t + 1 ]);
// Variant B (reduce.h:89-94): Volta+-safe form — snapshot partner into `var`,
// __syncwarp(), in-place combine via op.x, __syncwarp() again, per row j.
// NOTE(review): as in warp_reduce, the `var` declaration and the guard
// separating the variants are missing from this extraction — confirm.
89 if (B >= 64) _Pragma(
"unroll")
for (
int j = 0; j < HN; ++j) { var=sd[j][t+32];__syncwarp(); op.x(sd[j][t],var);__syncwarp(); }
90 if (B >= 32) _Pragma(
"unroll")
for (
int j = 0; j < HN; ++j) { var=sd[j][t+16];__syncwarp(); op.x(sd[j][t],var);__syncwarp(); }
91 if (B >= 16) _Pragma(
"unroll")
for (
int j = 0; j < HN; ++j) { var=sd[j][t+8 ];__syncwarp(); op.x(sd[j][t],var);__syncwarp(); }
92 if (B >= 8) _Pragma(
"unroll")
for (
int j = 0; j < HN; ++j) { var=sd[j][t+4 ];__syncwarp(); op.x(sd[j][t],var);__syncwarp(); }
93 if (B >= 4) _Pragma(
"unroll")
for (
int j = 0; j < HN; ++j) { var=sd[j][t+2 ];__syncwarp(); op.x(sd[j][t],var);__syncwarp(); }
94 if (B >= 2) _Pragma(
"unroll")
for (
int j = 0; j < HN; ++j) { var=sd[j][t+1 ];__syncwarp(); op.x(sd[j][t],var);__syncwarp(); }
// --- reduce (reduce.h:99..117; extraction-garbled Doxygen listing) ---
// Block-wise reduction kernel: each block folds a grid-strided slice of
// g_idata[0..n) with `op` and writes one partial result to
// g_odata[blockIdx.x]. Per the cross-reference list this is
//   __global__ void reduce(T* g_odata, const T* g_idata, size_t n, Op op)
// (the `__global__` qualifier was dropped by the extraction).
// NOTE(review): reduce.h:102-105 and 108-111 are missing from this chunk —
// presumably the opening brace, the `__shared__ T sd[B];` declaration, the
// `sd[t] = op.init();` seeding, and the __syncthreads() separating the load
// phase from the tree phase. Confirm against the full source.
99template <
class T,
unsigned int B,
class Op>
101void reduce(T* g_odata,
const T* g_idata,
size_t n, Op op = Op())
104 unsigned int t = threadIdx.x;
// Grid-stride accumulation: each thread folds elements t, t+B*gridDim.x, ...
// NOTE(review): `int i` is compared against `size_t n` — for n >= 2^31 the
// signed index overflows; flag for review (should likely be size_t).
106 for (
int i = t + blockIdx.x * B; i <
n; i += B * gridDim.x) {
107 sd[t] = op(sd[t], g_idata[i]);
// Shared-memory tree reduction. Each `if (B >= …)` is a compile-time-uniform
// template constant, so the __syncthreads() after the divergent `t < …` write
// is reached by the whole block.
112 if (B >= 512) {
if (t < 256) { sd[t] = op(sd[t], sd[t + 256]); } __syncthreads(); }
113 if (B >= 256) {
if (t < 128) { sd[t] = op(sd[t], sd[t + 128]); } __syncthreads(); }
114 if (B >= 128) {
if (t < 64 ) { sd[t] = op(sd[t], sd[t + 64 ]); } __syncthreads(); }
// Last 64 -> 1 elements are reduced within a single warp (no block barrier).
115 if (t < 32 ) warp_reduce<T, B, Op>(sd, t, op);
// Thread 0 publishes this block's partial result.
117 if (t == 0) g_odata[blockIdx.x] = sd[0];
// --- reduce2 (reduce.h:120..145; extraction-garbled Doxygen listing) ---
// HN-way variant of reduce: each block folds a grid-strided slice of the
// first HN columns of g_idata (rows of width N) into sd[HN][B] and writes HN
// partial results to g_odata[blockIdx.x][0..HN). Per the cross-reference list:
//   __global__ void reduce2(T(*g_odata)[HN], const T(*g_idata)[N], size_t n, Op op)
// (the `__global__` qualifier was dropped by the extraction).
// NOTE(review): presumably HN <= N — only the first HN entries of each input
// row are read; confirm against the full source.
120template <
class T,
unsigned int B,
unsigned int HN,
size_t N,
class Op>
122void reduce2(T (*g_odata)[HN],
const T (*g_idata)[N],
size_t n, Op op = Op())
124 __shared__ T sd[HN][B];
125 unsigned int t = threadIdx.x;
// NOTE(review): the body of this init loop (reduce.h:128, presumably
// `sd[j][t] = op.init();`) is missing from the extraction — confirm.
127 for (
int j = 0; j < HN; ++j)
// Grid-stride accumulation across all HN columns.
// NOTE(review): same `int i` vs `size_t n` overflow concern as in reduce.
129 for (
int i = t + blockIdx.x * B; i <
n; i += B * gridDim.x) {
131 for (
int j = 0; j < HN; ++j)
132 sd[j][t] = op(sd[j][t], g_idata[i][j]);
// Shared-memory tree reduction per column; `B >= …` is compile-time uniform,
// so the __syncthreads() after each divergent write is block-wide.
// NOTE(review): reduce.h:133-136 are missing — presumably a __syncthreads()
// separating the load phase from this tree phase; confirm.
137 if (B >= 512) {
if (t < 256) { _Pragma(
"unroll")
for (
int j = 0; j < HN; ++j) sd[j][t] = op(sd[j][t], sd[j][t + 256]); } __syncthreads(); }
138 if (B >= 256) {
if (t < 128) { _Pragma(
"unroll")
for (
int j = 0; j < HN; ++j) sd[j][t] = op(sd[j][t], sd[j][t + 128]); } __syncthreads(); }
139 if (B >= 128) {
if (t < 64 ) { _Pragma(
"unroll")
for (
int j = 0; j < HN; ++j) sd[j][t] = op(sd[j][t], sd[j][t + 64 ]); } __syncthreads(); }
// Last 64 -> 1 elements of every column reduced within a single warp.
140 if (t < 32 ) warp_reduce2<T, HN, B, Op>(sd, t, op);
// Publish the HN per-block partials.
// NOTE(review): reduce.h:141-143 are missing — presumably an `if (t == 0)`
// guard (as in reduce) precedes this output loop; confirm.
144 for (
int j = 0; j < HN; ++j)
145 g_odata[blockIdx.x][j] = sd[j][0];
int n
Number of atoms, padded up to a multiple of WARP_SIZE.
__global__ void reduce(T *g_odata, const T *g_idata, size_t n, Op op=Op())
Definition: reduce.h:101
__device__ void warp_reduce2(volatile T(*sd)[B], unsigned int t, Op op)
Definition: reduce.h:76
__global__ void reduce2(T(*g_odata)[HN], const T(*g_idata)[N], size_t n, Op op=Op())
Definition: reduce.h:122
__device__ void warp_reduce(volatile T *sd, unsigned int t, Op op)
Definition: reduce.h:51
__device__ void x(volatile T &a, T b) const
Definition: reduce.h:43
__device__ T operator()(T a, T b) const
Definition: reduce.h:37
__device__ T init() const
Definition: reduce.h:31
__device__ T init() const
Definition: reduce.h:9
__device__ T operator()(T a, T b) const
Definition: reduce.h:15
__device__ void x(volatile T &a, T b) const
Definition: reduce.h:21