
Is there a preprocessor macro in CUDA that tells whether we are compiling device code?

I'd like to define a pointer type that is shared between device and host code and that internally stores both the device and the host pointer to the mapped (shared) memory. It should determine at compile time which pointer to actually return:

#define F inline __host__ __device__

class SharedMemory;

/**
 * Can only be set by allocating shared memory.
 */
template<typename T>
class SharedMemoryPtr {
public:
    SharedMemoryPtr() : hptr(0), dptr(0) {}

    //F ~SharedMemoryPtr() {cudaFreeHost(hptr);} // Should be freed explicitly (?)

    // TODO: Don't allow copying/overwriting (at least not without freeing memory...)

    F T& operator() () {
#ifdef __CUDACC__
        return *dptr;
#else
        return *hptr;
#endif
    }

    F T* operator-> () {
#ifdef __CUDACC__
        return dptr;
#else
        return hptr;
#endif
    }

    F T& operator* () {
#ifdef __CUDACC__
        return *dptr;
#else
        return *hptr;
#endif
    }

    F T& operator[] (__int64 i) const {
#ifdef __CUDACC__
        return *(dptr + i);
#else
        return *(hptr + i);
#endif
    }

    friend SharedMemory;

    // TODO: Make obsolete (possible?)
    T* getHptr() {return hptr;}
    T* getDptr() {return dptr;}

private:
    T *hptr, *dptr;
};

class SharedMemory {
public:
    template<typename T>
    static SharedMemoryPtr<T> allocate(int count = 1) {
        assert(count > 0);

        SharedMemoryPtr<T> sptr;

        cutilSafeCall(
            cudaHostAlloc(&sptr.hptr, sizeof(T) * count, cudaHostAllocMapped));
        assert(sptr.hptr);
        cutilSafeCall(
            cudaHostGetDevicePointer(&sptr.dptr, sptr.hptr, 0));
        assert(sptr.dptr);

        return sptr;
    }
};
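
For context, a minimal usage sketch of the above (the kernel touch is made up for illustration; mapped host allocations require cudaSetDeviceFlags(cudaDeviceMapHost) before the CUDA context is created):

__global__ void touch(SharedMemoryPtr<int> p) {
    *p = 42; // device code: should dereference dptr
}

int main() {
    // Mapped (zero-copy) host memory must be enabled before any other
    // CUDA call creates the context.
    cudaSetDeviceFlags(cudaDeviceMapHost);

    SharedMemoryPtr<int> p = SharedMemory::allocate<int>();
    touch<<<1, 1>>>(p);
    cudaDeviceSynchronize();
    int v = *p; // host code: should dereference hptr and read 42
    return 0;
}

In a .cu file, however, the host-side dereference does not actually pick hptr, which is the problem described next.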


This works fine as long as I use these pointers in code that lives either in a .cpp file (where __CUDACC__ is never defined) or in a .h file (where __CUDACC__ is only defined if the header is included from a .cu file). However, in a __host__ function inside a .cu file I get the device pointer (dptr), since .cu files are processed entirely by nvcc (sketched below). Is there some other preprocessor macro that is defined ONLY for __global__ and __device__ functions, not for everything nvcc happens to process? Or do I need to separate my code?
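
To make the asymmetry concrete, here is a hypothetical .cu file (names are illustrative):

// Everything in a .cu file is compiled by nvcc, so __CUDACC__ is
// defined in both functions below:

__host__ void hostSide(SharedMemoryPtr<int> p) {
    int v = *p; // takes the __CUDACC__ branch: returns *dptr -- wrong on the host
}

__global__ void deviceSide(SharedMemoryPtr<int> p) {
    int v = *p; // also takes the __CUDACC__ branch: returns *dptr -- correct here
}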

Answer

__CUDA_ARCH__ is defined only when compiling device code, so you can use it to select device-specific behavior. When it is defined, it expands to the compute capability of the code being generated (e.g. 200 for compute capability 2.0); during the host pass it is undefined.
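
Applied to the wrapper from the question, a sketch of the fix is to switch the conditionals from __CUDACC__ to __CUDA_ARCH__. nvcc compiles a __host__ __device__ function once for the host (macro undefined) and once per device architecture (macro defined), so each pass gets the right branch:

F T& operator* () {
#ifdef __CUDA_ARCH__
    return *dptr; // device compilation pass
#else
    return *hptr; // nvcc's host pass, and plain host compilers
#endif
}

Because the macro expands to the target compute capability, it can also gate architecture-specific features, e.g. device-side printf, which needs compute capability 2.0 or higher:

#include <cstdio>

__device__ void report() {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 200
    printf("running on a Fermi-or-newer device\n");
#endif
}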