3#include "math/libfunc.h"
4#include "math/parallel.h"
5#include "tool/accasync.h"
6#include "tool/ptrtrait.h"
67 static_assert(std::is_enum<T>::value || std::is_integral<T>::value || std::is_floating_point<T>::value || std::is_trivial<T>::value,
"");
72template <
class DT,
class ST>
79 deviceMemoryCheckType<DT>();
80 deviceMemoryCheckType<ST>();
81 constexpr size_t ds =
sizeof(DT);
82 constexpr size_t ss =
sizeof(ST);
84 size_t size = ds * nelem;
88 std::vector<DT> buf(nelem);
89 for (
size_t i = 0; i < nelem; ++i)
98template <
class DT,
class ST>
105 deviceMemoryCheckType<DT>();
106 deviceMemoryCheckType<ST>();
107 constexpr size_t ds =
sizeof(DT);
108 constexpr size_t ss =
sizeof(ST);
110 size_t size = ss * nelem;
114 std::vector<ST> buf(nelem);
117 for (
size_t i = 0; i < nelem; ++i)
134 return reinterpret_cast<T*
>(p);
146 template <
class PTR,
class... PTRS>
147 static void allocate(
size_t nelem, PTR* pp, PTRS... pps)
159 template <
class PTR,
class... PTRS>
167 static void zero(
int q,
size_t nelem, PTR p)
174 template <
class PTR,
class... PTRS>
175 static void zero(
int q,
size_t nelem, PTR p, PTRS... ps)
178 zero(q, nelem, ps...);
181 template <
class PTR,
class U>
182 static void copyin(
int q,
size_t nelem, PTR dst,
const U* src)
188 template <
class U,
class PTR>
189 static void copyout(
int q,
size_t nelem, U* dst,
const PTR src)
196 template <
class PTR,
class U>
197 static void copy(
int q,
size_t nelem, PTR dst,
const U* src)
202 static_assert(std::is_same<DT, ST>::value,
"");
203 size_t size = N *
sizeof(ST) * nelem;
208 template <
class PTR,
class PTR2>
214 static_assert(std::is_same<T, T2>::value,
"");
215 return dotProd(flatten(ptr), flatten(b), nelem * N, q);
219 template <
class ANS,
class PTR,
class PTR2>
220 static void dot(
int q,
size_t nelem, ANS ans,
const PTR ptr,
const PTR2 ptr2)
225 static_assert(std::is_same<T, T2>::value,
"");
227 static_assert(std::is_same<T, TA>::value,
"");
228 dotProd(ans, flatten(ptr), flatten(ptr2), nelem * N, q);
231 template <
class FLT,
class PTR>
232 static void scale(
int q,
size_t nelem, FLT scal, PTR ptr)
238 template <
class FLT,
class PTR,
class... PTRS>
239 static void scale(
int q,
size_t nelem, FLT scal, PTR ptr, PTRS... ptrs)
241 scale(q, nelem, scal, ptr);
242 scale(q, nelem, scal, ptrs...);
Definition: ptrtrait.h:19
T dotProd(const T *a, const T *b, size_t nelem, int queue)
Dot product of two linear arrays.
Definition: parallel.h:82
void scaleArray(T *dst, T scal, size_t nelem, int queue)
Multiplies all of the elements in a 1D array by a scalar.
Definition: parallel.h:102
static void allocate(size_t nelem, PTR *pp, PTRS... pps)
Definition: darray.h:147
static void deallocate(PTR p)
Definition: darray.h:154
static void copyout(int q, size_t nelem, U *dst, const PTR src)
Definition: darray.h:189
static void scale(int q, size_t nelem, FLT scal, PTR ptr)
Definition: darray.h:232
static void zero(int q, size_t nelem, PTR p)
Definition: darray.h:167
static void dot(int q, size_t nelem, ANS ans, const PTR ptr, const PTR2 ptr2)
Calculates the dot product and saves the answer to pointer ans.
Definition: darray.h:220
static void scale(int q, size_t nelem, FLT scal, PTR ptr, PTRS... ptrs)
Definition: darray.h:239
static void copy(int q, size_t nelem, PTR dst, const U *src)
Copies data across two device memory pointers.
Definition: darray.h:197
static void zero(int q, size_t nelem, PTR p, PTRS... ps)
Definition: darray.h:175
static void copyin(int q, size_t nelem, PTR dst, const U *src)
Definition: darray.h:182
static void deallocate(PTR p, PTRS... ps)
Definition: darray.h:160
static PtrTrait< PTR >::type dotThenReturn(int q, size_t nelem, const PTR ptr, const PTR2 b)
Calculates the dot product and returns the answer to the host.
Definition: darray.h:209
static void allocate(size_t nelem, PTR *pp)
Definition: darray.h:139
Device array.
Definition: darray.h:128
void deviceMemoryAllocateBytes(void **pptr, size_t nbytes)
Allocates device pointer.
void deviceMemoryCopyinBytesAsync(void *dst, const void *src, size_t nbytes, int queue)
Similar to OpenACC async copyin, copies data from host to device.
void deviceMemoryZeroBytesAsync(void *dst, size_t nbytes, int queue)
Writes zero bytes on device.
void deviceMemoryCheckType()
Sanity check.
Definition: darray.h:65
void waitFor(int queue)
Similar to OpenACC wait and CUDA stream synchronize.
void deviceMemoryDeallocate(void *ptr)
Deallocates device pointer.
void deviceMemoryCopyoutBytesAsync(void *dst, const void *src, size_t nbytes, int queue)
Similar to OpenACC async copyout, copies data from device to host.
void deviceMemoryCopyBytesAsync(void *dst, const void *src, size_t nbytes, int queue)
Copies data between two pointers on device.
void deviceMemoryCopyin1dArray(DT *dst, const ST *src, size_t nelem, int q)
Copies data to 1D array, host to device.
Definition: darray.h:73
void deviceMemoryCopyout1dArray(DT *dst, const ST *src, size_t nelem, int q)
Copies data to 1D array, device to host.
Definition: darray.h:99