// NOTE(review): the header names of the bare #include directives below are
// missing in this copy of the file; restore them before building.
#include
#include
#include
#include
#include
#include
#include
#include "TimerWin32.hpp"
#include

#define ENTRADA_PGM "..//..//data//xadrez_preenchido.pgm"
#define PADRAO1_PGM "..//..//data//padrao1.pgm"
#define PADRAO2_PGM "..//..//data//padrao2.pgm"
#define SAIDA_PGM "..//..//data//saida.pgm"

#define BLOCK_X 16
#define BLOCK_Y 16

// Number of uint2 slots available in the `out` buffer passed to kernel().
// Must equal the size the host allocates for that buffer (runCUDAKernel
// allocates 10 entries).
#define MAX_MATCHES 10

// Running number of full pattern matches found by kernel(). The value read
// before each increment is used as the slot index into `out`, so it has to be
// zero when the kernel is launched.
__device__ unsigned int d_counter;

/*
 * Compares, for every pixel (x, y), the 5x5 window of `src` whose top-left
 * corner is (x, y) against the 25-element pattern `padrao`.
 *
 * Launch layout: 2D grid of 2D blocks. Each thread handles the pixel at
 * (blockIdx * blockDim + threadIdx) and addresses it as y * width + x.
 *
 * Parameters:
 *   out    - receives the (x, y) coordinates of every window that matches the
 *            pattern in all 25 positions. Slots are handed out through
 *            d_counter; at most MAX_MATCHES entries are stored.
 *   dst    - per-pixel debug value: (number of matching pixels) * 10.
 *   src    - input image, `width` elements per row.
 *   width  - row length of src/dst, in elements.
 *   padrao - the 25 pattern values, stored row by row (5 per row).
 *
 * After the kernel finishes, d_counter holds the total number of matches,
 * which may be larger than MAX_MATCHES; only the first MAX_MATCHES slots of
 * `out` are valid in that case.
 *
 * NOTE(review): there is no bounds check on the window. Threads closer than
 * 5 pixels to the right edge compare against pixels of the following row, and
 * threads closer than 5 rows to the bottom read past the last row. The kernel
 * has no height parameter, so the caller has to account for this - confirm
 * how the launch grid / buffer size is meant to handle the border.
 */
__global__ void kernel(uint2 *out, unsigned int *dst, unsigned int *src, unsigned int width, unsigned int *padrao)
{
    unsigned int x = __umul24(blockDim.x, blockIdx.x) + threadIdx.x;
    unsigned int y = __umul24(blockDim.y, blockIdx.y) + threadIdx.y;
    unsigned int index = __umul24(y, width) + x;

    // Count how many of the 25 window pixels equal the pattern. Both the debug
    // image (total * 10) and the match test (total == 25) expect a count, not
    // an OR of XOR differences.
    unsigned int total = 0;
    for (unsigned int j = 0; j < 5; ++j) {
        unsigned int jndex = j * 5;     // start of pattern row j
        unsigned int jw = j * width;    // offset of image row y + j
        for (unsigned int i = 0; i < 5; ++i) {
            if (src[index + jw + i] == padrao[jndex + i])
                total += 1;
        }
    }

    dst[index] = total * 10;

    if (total == 25) {
        // atomicAdd returns the previous counter value, which is a slot index
        // unique to this thread.
        unsigned int currentIndex = atomicAdd(&d_counter, 1u);

        // Keep counting past the capacity, but never write outside `out`.
        if (currentIndex < MAX_MATCHES)
            out[currentIndex] = make_uint2(x, y);
    }
}

extern "C" int runCUDAKernel(int argc, char* argv[])
{
    CUT_DEVICE_INIT(argc, argv);

    unsigned int *h_image = NULL;
    unsigned int *h_image_p = NULL;
    unsigned int *h_padrao1 = NULL;
    unsigned int *h_padrao2 = NULL;
    uint2 *h_target = NULL;
    unsigned int h_counter = 0;

    unsigned int *d_source = NULL;
    uint2 *d_target = NULL;
    unsigned int *d_padrao1 = NULL;
    unsigned int *d_padrao2 = NULL;

    unsigned int width, height;

    //unsigned int *h_debug = NULL;
    unsigned int *d_debug = NULL;

    // Load files
    int loaded =
cutLoadPGMi(PADRAO1_PGM, &h_padrao1, &width, &height); if (!loaded) { fprintf(stderr, "Can't touch this!\n"); return 1; } loaded = cutLoadPGMi(PADRAO2_PGM, &h_padrao2, &width, &height); if (!loaded) { fprintf(stderr, "Can't touch this!\n"); return 1; } loaded = cutLoadPGMi(ENTRADA_PGM, &h_image, &width, &height); if (!loaded) { fprintf(stderr, "Can't touch this!\n"); return 1; } int memsize = width*height*sizeof(unsigned int); // host -> host // host -> device h_target = (uint2*)malloc(10*sizeof(uint2)); cudaMalloc((void **)&d_source, memsize); cudaMalloc((void **)&d_target, 10*sizeof(uint2)); cudaMalloc((void **)&d_padrao1, 25*sizeof(unsigned int)); cudaMalloc((void **)&d_padrao2, 25*sizeof(unsigned int)); //cudaMalloc((void **)&d_counter, sizeof(unsigned int)); cudaMemset(&d_counter, 0, sizeof(unsigned int)); cudaMalloc((void **)&d_debug, memsize); TimerWin32 timer; timer.reset(); // resets timer cudaMemcpy(d_source, h_image, memsize, cudaMemcpyHostToDevice); cudaMemcpy(d_padrao2, h_padrao2, 25*sizeof(unsigned int), cudaMemcpyHostToDevice); //TODO dim3 gridDim(width/BLOCK_X, height/BLOCK_Y); dim3 blockDim(BLOCK_X, BLOCK_Y); kernel<<>>(d_target, d_debug, d_source, width, d_padrao2); cudaDeviceSynchronize(); cudaMemcpy(h_target, d_target, 10*sizeof(uint2),cudaMemcpyDeviceToHost); cudaMemcpy(&h_counter, &d_counter, sizeof(unsigned int), cudaMemcpyDeviceToHost); // XXX cudaMemcpy(h_image, d_debug, memsize, cudaMemcpyDeviceToHost); unsigned long time = timer.getMicroseconds(); std::cout << "Blablabla time: " << (static_cast(time)/1000.f) << " milliseconds." << std::endl; std::cout<