JRsz JRsz - 15 days ago 14
C++ Question

CUDA matrix transpose with shared memory

I need to implement a matrix transpose function on a GPU using shared memory. I have done it in a simple way without shared memory which works fine and also an attempt with SM. But unfortunately the calculation is not correct and I can not figure out why. A complete working example can be found here and at the bottom of this question.

EDIT 1

I further know that the first index of the result where I have a wrong value is index 32 (of the flatted matrix, so

matr[0][32]
in a two dimensional fashion).

If there are any more information neede I will prvide them gladly.

A short extract from the entire code which resembles the not working function is listed below:

__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width,
const int height, const int nreps)
{
__shared__ float tile[TILE_DIM][TILE_DIM + 1];
int blockIdx_y = blockIdx.x;
int blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x;
int xIndex = blockIdx_x * TILE_DIM + threadIdx.x;
int yIndex = blockIdx_y * TILE_DIM + threadIdx.y;
int index_in = xIndex + (yIndex)* width;

xIndex = blockIdx_y * TILE_DIM + threadIdx.x;
yIndex = blockIdx_x * TILE_DIM + threadIdx.y;
int index_out = xIndex + (yIndex)* height;

int r, i;
#pragma unroll
for (r = 0; r < nreps; r++)
{
#pragma unroll
for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
tile[threadIdx.y + i][threadIdx.x] = matrA[index_in + i * width];

__syncthreads();

#pragma unroll
for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
if (index_in + i * width < width * height)
matrB[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i];
}
}


The output looks like this:

Avg. CPU Transpose Time: 0.106048 ms, Bandwidth: 3.771873 GB/s

Avg. GPU Naive Trans Time: 0.009871 ms, bandwidth: 40.520836 GB/s
Correct: 50000, Wrong: 0

Avg. GPU Trans with SM Time: 0.007598 ms, bandwidth: 52.643482 GB/s
Correct: 12352, Wrong: 37648


Here is the full working example. I stripped most of the unnecessary code from it so it is less stuffed:

#include "cuda_runtime.h"
#include "device_functions.h"
#include "device_launch_parameters.h"

#include <chrono>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>

#define TILE_DIM 32
#define BLOCK_ROWS 8
#define BLOCK_COLS 32

cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation);
void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps);
__global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps);
__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps);

int main()
{
int i, width, height, nreps, size, wrong, correct;
double cpuTime, cpuBandwidth;
cudaError_t cudaStatus;

float *matrA, *matrATC, *matrATG, *matrAC;

srand(time(NULL));

nreps = 10000;
width = 500;
height = 100;
size = width * height;

matrA = (float*)malloc(size * sizeof(float)); // matrix A
matrAC = (float*)malloc(size * sizeof(float)); // matrix A copied
matrATC = (float*)malloc(size * sizeof(float)); // matrix A transposed by CPU
matrATG = (float*)malloc(size * sizeof(float)); // matrix A transposed by GPU

for (i = 0; i < size; i++)
{
matrA[i] = (float)i;
}

auto start = std::chrono::high_resolution_clock::now();

//CPU Transpose
cpuMatrTrans(matrATC, matrA, width, height, nreps);

auto end = std::chrono::high_resolution_clock::now();

std::chrono::duration<double> diff = end - start;
cpuTime = (diff.count() * 1000) / nreps;
cpuBandwidth = (sizeof(float) * size * 2) / (cpuTime * 1000000);//scaling from ms to s and B to GB doen implicitly, shortened in fraction, times two for read and write
printf("Avg. CPU Transpose Time: %f ms, Bandwidth: %f GB/s\n\n", cpuTime, cpuBandwidth);

correct = 0;
wrong = 0;

//Naive transpose
matrMagicCuda(matrATG, matrA, width, height, nreps, 1);

//Check if calc was correct
for (i = 0; i < size; i++)
{
if (matrATC[i] != matrATG[i])
{
/*printf("ERROR - %d - ATC:%f - ATG:%f\n\n", i, matrATC[i], matrATG[i]);
return;*/
wrong++;
}
else
{
correct++;
}
}

printf("\tCorrect: %d, Wrong: %d\n\n", correct, wrong);
correct = 0;
wrong = 0;

//Transpose with shared memory
matrMagicCuda(matrATG, matrA, width, height, nreps, 2);

//Check if calc was correct
for (i = 0; i < size; i++)
{
if (matrATC[i] != matrATG[i])
{
/*printf("ERROR - %d - ATC:%f - ATG:%f\n\n", i, matrATC[i], matrATG[i]);
return;*/
wrong++;
}
else
{
correct++;
}
}

//printf("\tTranspose with SM on GPU was executed correctly.\n\n");
printf("\tCorrect: %d, Wrong: %d\n\n", correct, wrong);
correct = 0;
wrong = 0;

// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaDeviceReset failed!\n");
return 1;
}

return 0;
}

cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation)
{
float elapsed = 0;
float *dev_matrA = 0;
float *dev_matrB = 0;
cudaError_t cudaStatus;
dim3 dim_grid, dim_block;
double gpuBandwidth;

int size = width * height;

dim_block.x = TILE_DIM;
dim_block.y = BLOCK_ROWS;
dim_block.z = 1;

dim_grid.x = (width + TILE_DIM - 1) / TILE_DIM;
dim_grid.y = (height + TILE_DIM - 1) / TILE_DIM;
dim_grid.z = 1;

// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}

// Allocate GPU buffers for three matrix
cudaStatus = cudaMalloc((void**)&dev_matrA, size * sizeof(float));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}

cudaStatus = cudaMalloc((void**)&dev_matrB, size * sizeof(float));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}

// Copy input matrix from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_matrA, matrA, size * sizeof(float), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

switch (operation)
{
case(1):
{
cudaEventRecord(start);
// Launch a kernel on the GPU with one thread for each element.
naiveTransKernel << <dim_grid, dim_block >> >(dev_matrB, dev_matrA, width, height, nreps);

cudaEventRecord(stop);
cudaEventSynchronize(stop);

cudaEventElapsedTime(&elapsed, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);

elapsed /= nreps;

gpuBandwidth = (sizeof(float) * size * 2) / (elapsed * 1000000);//scaling from ms to s and B to GB doen implicitly, shortened in fraction, times two for read and write
printf("Avg. GPU Naive Trans Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);

break;
}

case(2):
{
cudaEventRecord(start);
// Launch a kernel on the GPU with one thread for each element.
notSoNaivaTransKernel << <dim_grid, dim_block >> >(dev_matrB, dev_matrA, width, height, nreps);

cudaEventRecord(stop);
cudaEventSynchronize(stop);

cudaEventElapsedTime(&elapsed, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);

elapsed /= nreps;

gpuBandwidth = (sizeof(float) * size * 2) / (elapsed * 1000000);//scaling from ms to s and B to GB doen implicitly, shortened in fraction, times two for read and write
printf("Avg. GPU Trans with SM Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);

break;
}

default:
printf("No matching opcode was found.\n");
}

// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}

// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching Kernel!\n", cudaStatus);
goto Error;
}

// Copy output matrix from GPU buffer to host memory.
cudaStatus = cudaMemcpy(matrB, dev_matrB, size * sizeof(float), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}

Error:
cudaFree(dev_matrB);
cudaFree(dev_matrA);

return cudaStatus;
}

void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps)
{
int i, j, r;

#pragma unroll
for (r = 0; r < nreps; r++)
#pragma unroll
for (i = 0; i < height; i++)
#pragma unroll
for (j = 0; j < width; j++)
matrB[j * height + i] = matrA[i * width + j];
}

__global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps)
{
int i, r;
int row = blockIdx.x * TILE_DIM + threadIdx.x;
int col = blockIdx.y * TILE_DIM + threadIdx.y;
int index_in = row + width * col;
int index_out = col + height * row;

#pragma unroll
for (r = 0; r < nreps; r++)
#pragma unroll
for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
if (index_in + i * width < width * height)
matrB[index_out + i] = matrA[index_in + i * width];
}

__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps)
{
__shared__ float tile[TILE_DIM][TILE_DIM + 1];
int blockIdx_y = blockIdx.x;
int blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x;
int xIndex = blockIdx_x * TILE_DIM + threadIdx.x;
int yIndex = blockIdx_y * TILE_DIM + threadIdx.y;
int index_in = xIndex + (yIndex)* width;

xIndex = blockIdx_y * TILE_DIM + threadIdx.x;
yIndex = blockIdx_x * TILE_DIM + threadIdx.y;
int index_out = xIndex + (yIndex)* height;

int r, i;
#pragma unroll
for (r = 0; r < nreps; r++)
{
#pragma unroll
for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
tile[threadIdx.y + i][threadIdx.x] = matrA[index_in + i * width];

__syncthreads();

#pragma unroll
for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
if (index_in + i * width < width * height)
matrB[index_out + i * height] = tile[threadIdx.x][threadIdx.y + i];
}
}

Answer

There were a number of issues with this code. I'm not sure I will be able to cover all of them.

Possibly the most important issue was your lack of (and lack of understanding of) a proper 2D thread check. Your algorithm creates a grid of threads that is larger in both dimensions than the problem size. This creates logical threads outside of the dimension of your matrix, in both dimensions.

You have attempted to create a 2D thread check like this:

        if (index_in + i * width < width * height)

This will not work. Suppose I have a 3x3 matrix and a 4x4 grid of threads. The thread at (3,0) is clearly out-of-bounds for your matrix, but would pass your 2D thread check.

In this situation, a proper thread check must test each dimension separately, not as a product.

Note that this logical error exists in your "naive" transpose kernel as well, and you can confirm that if you run your code with cuda-memcheck. It will indicate out-of-bounds access errors in that kernel, even though it appears to work correctly.

There were various other problems as well. Most of these had to do with indexing in your shared memory kernel. It's not clear to me that you understood the necessary indexing manipulation for a shared memory transpose. There are two separate indexing transposes that we must do in this case:

  1. Transpose the block (tile) indices
  2. Transpose the thread indices

Transposition of the thread indices is done upon reading/writing shared memory. You have this correctly accounted for with a reversal of the use of threadIdx.x and threadIdx.y for read/write of shared memory. But as near as I can tell, your index generation for the reversal of block indices (which reversal is used during the reading/writing to global memory) was broken. That was the other major issue to be sorted out.

The following code has these and some other issues fixed, and appears to work correctly for me:

$ cat t33.cu
#include "cuda_runtime.h"
#include "device_functions.h"
#include "device_launch_parameters.h"

#include <chrono>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>

#define TILE_DIM 32
#define BLOCK_ROWS 8
#define BLOCK_COLS 32

cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation);
void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps);
__global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps);
__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps);

int main()
{
    int i, width, height, nreps, size, wrong, correct;
    double cpuTime, cpuBandwidth;
    cudaError_t cudaStatus;

    float *matrA, *matrATC, *matrATG, *matrAC;

    srand(time(NULL));

    nreps = 10000;
    width = 500;
    height = 100;


    size = width * height;

    matrA = (float*)malloc(size * sizeof(float)); // matrix A
    matrAC = (float*)malloc(size * sizeof(float)); // matrix A copied
    matrATC = (float*)malloc(size * sizeof(float)); // matrix A transposed by CPU
    matrATG = (float*)malloc(size * sizeof(float)); // matrix A transposed by GPU

    for (i = 0; i < size; i++)
    {
        matrA[i] = (float)i;
    }

    auto start = std::chrono::high_resolution_clock::now();

    //CPU Transpose
    cpuMatrTrans(matrATC, matrA, width, height, nreps);

    auto end = std::chrono::high_resolution_clock::now();

    std::chrono::duration<double> diff = end - start;
    cpuTime = (diff.count() * 1000) / nreps;
    cpuBandwidth = (sizeof(float) * size * 2) / (cpuTime * 1000000);//scaling from ms to s and B to GB doen implicitly, shortened in fraction, times two for read and write
    printf("Avg. CPU Transpose Time: %f ms, Bandwidth: %f GB/s\n\n", cpuTime, cpuBandwidth);

    correct = 0;
    wrong = 0;

    //Naive transpose
    memset(matrATG, 0, size*sizeof(float));
    matrMagicCuda(matrATG, matrA, width, height, nreps, 1);

    //Check if calc was correct
    for (i = 0; i < size; i++)
    {
        if (matrATC[i] != matrATG[i])
        {
            /*printf("ERROR - %d - ATC:%f - ATG:%f\n\n", i, matrATC[i], matrATG[i]);
            return;*/
            wrong++;
        }
        else
        {
            correct++;
        }
    }

    printf("\tCorrect: %d, Wrong: %d\n\n", correct, wrong);
    correct = 0;
    wrong = 0;

    //Transpose with shared memory
    memset(matrATG, 0, size*sizeof(float));
    matrMagicCuda(matrATG, matrA, width, height, nreps, 2);

    //Check if calc was correct
    for (i = 0; i < size; i++)
    {
        if (matrATC[i] != matrATG[i])
        {
            /*printf("ERROR - %d - ATC:%f - ATG:%f\n\n", i, matrATC[i], matrATG[i]);
            return;*/
            wrong++;
        }
        else
        {
            correct++;
        }
    }

    //printf("\tTranspose with SM on GPU was executed correctly.\n\n");
    printf("\tCorrect: %d, Wrong: %d\n\n", correct, wrong);
    correct = 0;
    wrong = 0;

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed!\n");
        return 1;
    }

    return 0;
}

cudaError_t matrMagicCuda(float *matrB, float *matrA, const int width, const int height, const int nreps, const int operation)
{
    float elapsed = 0;
    float *dev_matrA = 0;
    float *dev_matrB = 0;
    cudaError_t cudaStatus;
    dim3 dim_grid, dim_block;
    double gpuBandwidth;

    int size = width * height;

    dim_block.x = TILE_DIM;
    dim_block.y = BLOCK_ROWS;
    dim_block.z = 1;

    dim_grid.x = (width + TILE_DIM - 1) / TILE_DIM;
    dim_grid.y = (height + TILE_DIM - 1) / TILE_DIM;
    dim_grid.z = 1;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three matrix
    cudaStatus = cudaMalloc((void**)&dev_matrA, size * sizeof(float));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_matrB, size * sizeof(float));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input matrix from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_matrA, matrA, size * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaMemset(dev_matrB, 0, size * sizeof(float));
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    switch (operation)
    {
        case(1):
        {
            cudaEventRecord(start);
            // Launch a kernel on the GPU with one thread for each element.
            naiveTransKernel << <dim_grid, dim_block >> >(dev_matrB, dev_matrA, width, height, nreps);

            cudaEventRecord(stop);
            cudaEventSynchronize(stop);

            cudaEventElapsedTime(&elapsed, start, stop);
            cudaEventDestroy(start);
            cudaEventDestroy(stop);

            elapsed /= nreps;

            gpuBandwidth = (sizeof(float) * size * 2) / (elapsed * 1000000);//scaling from ms to s and B to GB doen implicitly, shortened in fraction, times two for read and write
            printf("Avg. GPU Naive Trans Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);

            break;
        }

        case(2):
        {
            cudaEventRecord(start);
            // Launch a kernel on the GPU with one thread for each element.
            notSoNaivaTransKernel << <dim_grid, dim_block >> >(dev_matrB, dev_matrA, width, height, nreps);

            cudaEventRecord(stop);
            cudaEventSynchronize(stop);

            cudaEventElapsedTime(&elapsed, start, stop);
            cudaEventDestroy(start);
            cudaEventDestroy(stop);

            elapsed /= nreps;

            gpuBandwidth = (sizeof(float) * size * 2) / (elapsed * 1000000);//scaling from ms to s and B to GB doen implicitly, shortened in fraction, times two for read and write
            printf("Avg. GPU Trans with SM Time: %f ms, bandwidth: %f GB/s\n", elapsed, gpuBandwidth);

            break;
        }

    default:
        printf("No matching opcode was found.\n");
    }

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching Kernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output matrix from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(matrB, dev_matrB, size * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_matrB);
    cudaFree(dev_matrA);

    return cudaStatus;
}

void cpuMatrTrans(float *matrB, float *matrA, const int width, const int height, const int nreps)
{
    int i, j, r;

#pragma unroll
    for (r = 0; r < nreps; r++)
#pragma unroll
        for (i = 0; i < height; i++)
#pragma unroll
            for (j = 0; j < width; j++)
                matrB[j * height + i] = matrA[i * width + j];
}

__global__ void naiveTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps)
{
    int i, r;
    int col = blockIdx.x * TILE_DIM + threadIdx.x;
    int row = blockIdx.y * TILE_DIM + threadIdx.y;
    int index_in = col + width * row;
    int index_out = row + height * col;

#pragma unroll
    for (r = 0; r < nreps; r++)
#pragma unroll
        for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
            if ((row+i<height) && (col < width))
                matrB[index_out + i] = matrA[index_in + i * width];
}

__global__ void notSoNaivaTransKernel(float *matrB, float *matrA, const int width, const int height, const int nreps)
{
    __shared__ float tile[TILE_DIM][TILE_DIM + 1];
    int ciIndex = blockIdx.x * TILE_DIM + threadIdx.x;
    int riIndex = blockIdx.y * TILE_DIM + threadIdx.y;
    int coIndex = blockIdx.y * TILE_DIM + threadIdx.x;
    int roIndex = blockIdx.x * TILE_DIM + threadIdx.y;
    int index_in = ciIndex + (riIndex)* width;
    int index_out = coIndex + (roIndex)* height;


    int r, i;
#pragma unroll
    for (r = 0; r < nreps; r++)
    {
#pragma unroll
        for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
            if ((ciIndex<width) && (riIndex+i < height))
              tile[threadIdx.y + i][threadIdx.x] = matrA[index_in + i * width];
        __syncthreads();

#pragma unroll
        for (i = 0; i < TILE_DIM; i += BLOCK_ROWS)
            if ((coIndex<height) && (roIndex+i < width))
               matrB[index_out + i*height] = tile[threadIdx.x][threadIdx.y + i];
        __syncthreads();
    }
}
$ nvcc -std=c++11 -arch=sm_61 -o t33 t33.cu
t33.cu(25): warning: variable "matrAC" was set but never used

t33.cu(25): warning: variable "matrAC" was set but never used

$ cuda-memcheck ./t33
========= CUDA-MEMCHECK
Avg. CPU Transpose Time: 0.143087 ms, Bandwidth: 2.795509 GB/s

Avg. GPU Naive Trans Time: 0.028587 ms, bandwidth: 13.992195 GB/s
        Correct: 50000, Wrong: 0

Avg. GPU Trans with SM Time: 0.040328 ms, bandwidth: 9.918678 GB/s
        Correct: 50000, Wrong: 0

========= ERROR SUMMARY: 0 errors
$ ./t33
Avg. CPU Transpose Time: 0.140469 ms, Bandwidth: 2.847594 GB/s

Avg. GPU Naive Trans Time: 0.003828 ms, bandwidth: 104.505440 GB/s
        Correct: 50000, Wrong: 0

Avg. GPU Trans with SM Time: 0.000715 ms, bandwidth: 559.206604 GB/s
        Correct: 50000, Wrong: 0

$

Note: The code attempts to measure bandwidth. However, you should be aware that the measured bandwidth here is affected by the cache bandwidth. Your matrix sizes (500x100 = 200Kbytes each for input and output) here are easily small enough to fit in the L2 cache on most GPUs. This fact, coupled with the fact that you are running the same transpose multiple times (nreps) means that much of the work is operating directly out of the L2 cache. Therefore, in the "optimized" case above, we see a reported bandwidth number that greatly exceeds the available memory bandwidth of the GPU (this case happens to be a Pascal Titan X, so about ~340GB/s of main memory bandwidth available). This is due to the fact that this measurement includes some benefit from the L2 cache, whose bandwidth is at least twice as high as main memory bandwidth. You can eliminate this effect by using a significantly larger matrix size and/or reducing nreps to 1.