FREE Subscription to Dr. Dobb’s Digest: Same Great Content, New Digital Edition
Site Archive (Complete)
Architecture & Design
Email
Print
Reprint

add to:
Del.icio.us
Digg
Google
Furl
Slashdot
Y! MyWeb
Blink
TABLE OF CONTENTS
June 23, 2009

CUDA, Supercomputing for the Masses: Part 13

(Page 3 of 4)

An Example

Let's take a look at the following very simple example, readTexels.cu, which demonstrates how to bind a texture to a CUDA array and sets the filterMode attribute to cudaFilterModeLinear.

//readTexels.cu #include <stdio.h>

void checkCUDAError(const char *msg) { cudaError_t err = cudaGetLastError(); if( cudaSuccess != err) { fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) ); exit(EXIT_FAILURE); } }

texture<float, 1, cudaReadModeElementType> texRef;

__global__ void readTexels(int n, float *d_out) { int idx = blockIdx.x*blockDim.x + threadIdx.x; if(idx < n) { //Note: Appendix D.2 gives formula for interpolation float x = tex1D(texRef, float(idx)); d_out[idx] = x; } }

#define NUM_THREADS 256

int main() { int N = 10; // 10 is illustrative and should be larger in practice int nBlocks = N/NUM_THREADS + ((N % NUM_THREADS)?1:0); float *d_out; // allocate space on the device for the results cudaMalloc((void**)&d_out, sizeof(float) * N);

// allocate space on the host for the results float *h_out = (float*)malloc(sizeof(float)*N);

// data fill array with increasing values float *data = (float*)malloc(N*sizeof(float)); for (int i = 0; i < N; i++) data[i] = float(i); // create a CUDA array on the device cudaArray* cuArray; cudaMallocArray (&cuArray, &texRef.channelDesc, N, 1); cudaMemcpyToArray(cuArray, 0, 0, data, sizeof(float)*N, cudaMemcpyHostToDevice); // bind a texture to the CUDA array cudaBindTextureToArray (texRef, cuArray);

// host side settable texture attributes texRef.normalized = false; texRef.filterMode = cudaFilterModeLinear; // read texels from texture readTexels<<<nBlocks, NUM_THREADS>>>(N, d_out); // copy texels to host cudaMemcpy(h_out, d_out, sizeof(float)*N, cudaMemcpyDeviceToHost); // look at them for (int i = 0; i << N; i++) { printf("%f\n",h_out[i]); } free(h_out);

cudaFree(d_out); cudaFreeArray(cuArray); cudaUnbindTexture(texRef); checkCUDAError("cuda free operations"); }

Under Linux, the following nvcc command-line can be used to build this program:

nvcc readTexel.cu "o readTexel

On the host side, the texture reference, texRef, is created with:

texture<float, 1, cudaReadModeElementType> texRef;

A CUDA array, cuArray, is allocated and initialized:

// create a CUDA array on the device cudaArray* cuArray; cudaMallocArray (&cuArray, &texRef.channelDesc, N, 1);

The texRef texture is then bound to cuArray and the texture attributes are set. In this case, we specify linear interpolation and we will not be using normalized texture coordinates.

// bind a texture to the CUDA array cudaBindTextureToArray (texRef, cuArray);

// host side settable texture attributes texRef.normalized = false; texRef.filterMode = cudaFilterModeLinear;

The kernel, readTexels(), simply fetches values from the texture unit and places them in the d_out array.

//Note: Appendix D.2 gives formula for interpolation float x = tex1D(texRef, float(idx)); d_out[idx] = x;

The d_out array is then copied back to the host and printed out on the screen. Finally, the texture is released with the call:

cudaUnbindTexture(texRef);

Playing with the attributes and data in this simple example might help clarify the processing capabilities of texture memory. For this example, you should see the following output demonstrating that the texture is interpolating between data points.

0.000000 0.500000 1.500000 2.500000 3.500000 4.500000 5.500000 6.500000 7.500000 8.500000

Example 1: Binding a texture to linear memory that is updated in-place.

The following simple example, negateArray.cu, binds a 1D texture to linear memory. The texture is used to fetch floating-point values from the linear memory and the texture is then updated in-place. The results are then brought back to the host and checked for correctness.

#include <stdio.h> #include <assert.h>

void checkCUDAError(const char *msg) { cudaError_t err = cudaGetLastError(); if( cudaSuccess != err) { fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) ); exit(EXIT_FAILURE); } }

texture<float, 1, cudaReadModeElementType> texRef;

__global__ void kernel(int n, float *d_out) { int idx = blockIdx.x*blockDim.x + threadIdx.x;

if(idx < n) { d_out[idx] = -tex1Dfetch(texRef, idx); } }

#define NUM_THREADS 256 int main() { int N = 2560; int nBlocks = N/NUM_THREADS + ((N % NUM_THREADS)?1:0); int memSize = N*sizeof(float); // data fill array with increasing values float *data; data = (float*) malloc(memSize); for (int i = 0; i < N; i++) data[i] = float(i); float *d_a; cudaMalloc( (void **) &d_a, memSize ); cudaMemcpy( d_a, data, memSize, cudaMemcpyHostToDevice ); cudaBindTexture(0,texRef,d_a,memSize); checkCUDAError("bind"); kernel<<<nBlocks, NUM_THREADS>>>(N, d_a);

float *h_out = (float*)malloc(memSize); cudaMemcpy(h_out, d_a, memSize, cudaMemcpyDeviceToHost); checkCUDAError("cudaMemcpy"); for (int i = 0; i <<N; i++) { assert(data[i] == -h_out[i]); } printf("Correct\n"); cudaUnbindTexture(texRef); checkCUDAError("cudaUnbindTexture");

free(h_out); free(data); }

There are a few minor but important differences between negateArray.cu and the previous readTexels.cu example.

The first real difference is that we allocate a linear region of memory, d_a, with cudaMalloc():

float *d_a; cudaMalloc( (void **) &d_a, memSize );

This linear memory is bound to a texture with the following:

cudaBindTexture(0,texRef,d_a,memSize); checkCUDAError("bind");

On the device, tex1Dfetch() is used to fetch the data, which is then negated and written to d_out:

d_out[idx] = -tex1Dfetch(texRef, idx);

Please note that the kernel call passed d_a, which means that the data is updated in-place:

kernel<<<nBlocks, NUM_THREADS>>>(N, d_a);

Example 2: Revisiting the reverseArray_multiblock.cu example

Finally, let's revisit the reverseArray_multiblock.cu example, which was discussed in detail in Part 3 of this series and adapt it to use texture memory. As can be seen in the source for reverseArray_multiblockTexture.cu below, only a few minor changes were needed to change from using a linear array to a texture object bound to the linear region of memory, d_a, allocated with cudaMalloc. For convenience, changes from reverseArray_multiblock.cu are highlighted with red and the "* Texture Specific *" string.

// reverseArray_multiblockTexture.cu

// includes, system #include <stdio.h> #include <assert.h>

// Simple utility function to check for CUDA runtime errors void checkCUDAError(const char* msg);

// ****************** Texture Specific ******************* // Note: default mode is cudaReadModeElementType // section 4.3.4.1 of the NVIDIA CUDA Programming Guide texture<int, 1> tex_d_a;

// Part3: implement the kernel __global__ void reverseArrayTexture(int *d_out, int *d_in) { int inOffset = blockDim.x * blockIdx.x; int outOffset = blockDim.x * (gridDim.x - 1 - blockIdx.x); int in = inOffset + threadIdx.x; int out = outOffset + (blockDim.x - 1 - threadIdx.x);

// ****************** Texture Specific ******************* d_out[out] = tex1Dfetch(tex_d_a,in); }

// Program main int main( int argc, char** argv) { // pointer for host memory and size int *h_a; int dimA = 256 * 1024; // 256K elements (1MB total) // pointer for device memory int *d_b, *d_a; // define grid and block size int numThreadsPerBlock = 256; // Part 1: compute number of blocks needed based on // array size and desired block size int numBlocks = dimA / numThreadsPerBlock; // allocate host and device memory size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int); h_a = (int *) malloc(memSize); cudaMalloc( (void **) &d_a, memSize ); cudaMalloc( (void **) &d_b, memSize );

// ****************** Texture Specific ******************* // Bind the device array d_a to a texture object tex_d_a cudaBindTexture(NULL,tex_d_a,d_a); checkCUDAError("Bind Texture"); // Initialize input array on host for (int i = 0; i < dimA; ++i) { h_a[i] = i; } // Copy host array to device array cudaMemcpy( d_a, h_a, memSize, cudaMemcpyHostToDevice ); // launch kernel dim3 dimGrid(numBlocks); dim3 dimBlock(numThreadsPerBlock); reverseArrayTexture<<< dimGrid, dimBlock >>>( d_b, d_a ); // block until the device has completed cudaThreadSynchronize(); // check if kernel execution generated an error // Check for any CUDA errors checkCUDAError("kernel invocation"); // device to host copy cudaMemcpy( h_a, d_b, memSize, cudaMemcpyDeviceToHost ); // Check for any CUDA errors checkCUDAError("memcpy"); // verify the data returned to the host is correct for (int i = 0; i < dimA; i++) { assert(h_a[i] == dimA - 1 - i ); } // ****************** Texture Specific ******************* cudaUnbindTexture(tex_d_a); checkCUDAError("Unbind Texture");

// free device memory cudaFree(d_a); cudaFree(d_b); // free host memory free(h_a); // If the program makes it this far, then the results are // correct and there are no run-time errors. Good work! printf("Correct!\n"); return 0; }

void checkCUDAError(const char *msg) { cudaError_t err = cudaGetLastError(); if( cudaSuccess != err) { fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) ); exit(EXIT_FAILURE); } }

In short summary form, CUDA texturing requires the following steps:

  • Host (CPU) code
    • Allocate/obtain memory (linear memory, pitch linear memory, or CUDA array)
    • Create a texture reference object

      • Currently must be at file-scope

    • Bind the texture reference to memory/array
    • When done

      • Unbind the texture reference, free resources

  • Device (kernel) code

    • Fetch using texture reference
    • Linear memory textures

      • tex1Dfetch

    • Array textures and pitch linear memory

      • tex1D, tex2D, or tex3D

This structure can be seen in reverseArray_multiblockTexture.cu:

  • Host (CPU) code:

    // reverseArray_multiblockTexture.cu ... // ****************** Texture Specific ******************* // Note: default mode is cudaReadModeElementType // section 4.3.4.1 of the NVIDIA CUDA Programming Guide texture<int, 1> tex_d_a; // Program main int main( int argc, char** argv) { ... // pointer for device memory int *d_b, *d_a; ... cudaMalloc( (void **) &d_a, memSize ); ... // ****************** Texture Specific ******************* // Bind the device array d_a to a texture object tex_d_a cudaBindTexture(NULL,tex_d_a,d_a); checkCUDAError("Bind Texture"); ... // ****************** Texture Specific ******************* cudaUnbindTexture(tex_d_a); checkCUDAError("Unbind Texture"); ... }

  • Device (kernel) code:

    
    // Part3: implement the kernel 
    __global__ void reverseArrayTexture(int *d_out, int *d_in) 
    {
      // ****************** Texture Specific *******************
      d_out[out] = tex1Dfetch(tex_d_a,in);
    }
    
Previous Page | 1 Introduction | 2 Linear Memory | 3 An Example | 4 Conclusion Next Page
TOP 5 ARTICLES
No Top Articles.



MICROSITES
FEATURED TOPIC

ADDITIONAL TOPICS

INFO-LINK