c++ - OpenCL computation does not match output of sequential algorithm
I'm trying to implement a naive version of LU decomposition in OpenCL. To start, I implemented a sequential version in C++ and constructed methods to verify the result (i.e., multiplication methods). Next I implemented the algorithm in a kernel and tested it with manually verified input (i.e., a 5x5 matrix). This works fine.
However, when I run the algorithm on a randomly generated matrix bigger than 5x5 I get unusual results. I've cleaned up the code and checked the calculations manually, but I can't figure out where my kernel is going wrong. I'm starting to think the problem might have something to do with floats and the stability of the calculations. By that I mean that error margins get propagated and grow bigger and bigger. I'm well aware that I can swap rows with the biggest pivot value and such, but sometimes the error margin is way off. And in any case I would have expected the result, albeit a wrong one, to be the same as that of the sequential algorithm. I would appreciate some help identifying what I'm doing wrong.
I'm using a single-dimensional array, so addressing the matrix in two dimensions happens like this:
a(row, col) = a[row * matrix_width + col].
Regarding the results, I might add that I decided to merge the L and U matrices into one. Given L and U:
l:        u:
1 0 0     a b c
x 1 0     0 d e
y z 1     0 0 f
I display them as:
a:
a b c
x d e
y z f
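To make the merged layout concrete, here is a small sketch (the helper name splitLU is mine, not from the post) that recovers L and U from the combined storage, using the row-major indexing scheme above:

```cpp
#include <vector>

// Hypothetical helper (not part of the original code): split a merged LU
// matrix, stored row-major as a[row * n + col], back into a unit
// lower-triangular L and an upper-triangular U.
void splitLU(const std::vector<float>& lu, int n,
             std::vector<float>& L, std::vector<float>& U)
{
    L.assign(n * n, 0.0f);
    U.assign(n * n, 0.0f);
    for (int row = 0; row < n; ++row)
    {
        for (int col = 0; col < n; ++col)
        {
            float v = lu[row * n + col];
            if (row > col)
                L[row * n + col] = v;   // below the diagonal: L entries
            else
                U[row * n + col] = v;   // diagonal and above: U entries
        }
        L[row * n + row] = 1.0f;        // L's diagonal of ones is implicit
    }
}
```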
The kernel is the following:
The parameter source is the original matrix we want to decompose. The parameter destin is the destination. matrix_size is the total size of the matrix (so 9 for a 3x3 matrix) and matrix_width is the width (3 for a 3x3 matrix).
__kernel void matrix(
    __global float * source,
    __global float * destin,
    unsigned int matrix_size,
    unsigned int matrix_width
)
{
    unsigned int index = get_global_id(0);
    int col_idx = index % matrix_width;
    int row_idx = index / matrix_width;

    if (index >= matrix_size)
        return;

    // First of all, copy our value to the destination.
    destin[index] = source[index];

    // Iterate over the pivots.
    for (int piv_idx = 0; piv_idx < matrix_width; piv_idx++)
    {
        // We only work on rows below the pivot row
        // and on the pivot column or to the right of it.
        if (col_idx < piv_idx || row_idx <= piv_idx)
            return;

        // Calculate the divisor.
        float pivot_value = destin[(piv_idx * matrix_width) + piv_idx];
        float below_pivot_value = destin[(row_idx * matrix_width) + piv_idx];
        float divisor = below_pivot_value / pivot_value;

        // The value in the pivot row on this column.
        float pivot_row_value = destin[(piv_idx * matrix_width) + col_idx];
        float current_value = destin[index];

        destin[index] = current_value - (pivot_row_value * divisor);

        // Write the divisor to memory (we won't use these values anymore!)
        // if this value is under the pivot.
        barrier(CLK_GLOBAL_MEM_FENCE);
        if (col_idx == piv_idx)
        {
            int divisor_location = (row_idx * matrix_width) + piv_idx;
            destin[divisor_location] = divisor;
        }
        barrier(CLK_GLOBAL_MEM_FENCE);
    }
}
This is the sequential version:
// Decomposes the matrix into L and U in the same matrix.
float * decompose(float* a, int matrix_width)
{
    int total_length = matrix_width * matrix_width;
    float *u = new float[total_length];

    for (int i = 0; i < total_length; i++)
    {
        u[i] = a[i];
    }

    for (int row = 0; row < matrix_width; row++)
    {
        int pivot_idx = row;
        float pivot_val = u[pivot_idx * matrix_width + pivot_idx];

        for (int r = row + 1; r < matrix_width; r++)
        {
            float below_pivot = u[r * matrix_width + pivot_idx];
            float divisor = below_pivot / pivot_val;

            for (int row_idx = pivot_idx; row_idx < matrix_width; row_idx++)
            {
                float value = u[row * matrix_width + row_idx];
                u[r * matrix_width + row_idx] = u[r * matrix_width + row_idx] - (value * divisor);
            }
            u[r * matrix_width + pivot_idx] = divisor;
        }
    }
    return u;
}
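Since the post mentions verifying the result with multiplication methods, here is one way such a check could look (a sketch of mine, not the asker's actual verifier): it rebuilds A = L * U directly from the merged in-place result and compares it element-wise against the original within a tolerance:

```cpp
#include <cmath>
#include <vector>

// Hypothetical verification helper (not the asker's code): reconstruct
// A = L * U from the merged decomposition and compare it element-wise
// to the original matrix a within a tolerance eps.
bool verifyLU(const std::vector<float>& a, const std::vector<float>& lu,
              int n, float eps = 1e-3f)
{
    for (int row = 0; row < n; ++row)
    {
        for (int col = 0; col < n; ++col)
        {
            float sum = 0.0f;
            for (int k = 0; k < n; ++k)
            {
                // L(row, k): implicit 1 on the diagonal, stored value below it, 0 above.
                float l = (k == row) ? 1.0f : (k < row ? lu[row * n + k] : 0.0f);
                // U(k, col): stored value on or above the diagonal, 0 below.
                float u = (k <= col) ? lu[k * n + col] : 0.0f;
                sum += l * u;
            }
            if (std::fabs(sum - a[row * n + col]) > eps)
                return false;
        }
    }
    return true;
}
```

For example, for the 2x2 matrix {4, 3, 6, 3} the merged decomposition is {4, 3, 1.5, -1.5} (divisor 6/4 = 1.5), and verifyLU accepts it while rejecting a perturbed factorization.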
An example of the output is the following:
workgroup size: 1
array dimension: 6

original unfactorized:
| 176.000000 | 133.000000 | 431.000000 | 839.000000 | 739.000000 | 450.000000 |
| 507.000000 | 718.000000 | 670.000000 | 753.000000 | 122.000000 | 941.000000 |
| 597.000000 | 449.000000 | 596.000000 | 742.000000 | 491.000000 | 212.000000 |
| 159.000000 | 944.000000 | 797.000000 | 717.000000 | 822.000000 | 219.000000 |
| 266.000000 | 755.000000 | 33.000000 | 231.000000 | 824.000000 | 785.000000 |
| 724.000000 | 408.000000 | 652.000000 | 863.000000 | 663.000000 | 113.000000 |

sequential:
| 176.000000 | 133.000000 | 431.000000 | 839.000000 | 739.000000 | 450.000000 |
| 2.880682 | 334.869324 | -571.573853 | -1663.892090 | -2006.823730 | -355.306763 |
| 3.392045 | -0.006397 | -869.627747 | -2114.569580 | -2028.558716 | -1316.693359 |
| 0.903409 | 2.460203 | -2.085742 | -357.893066 | 860.526367 | -2059.689209 |
| 1.511364 | 1.654343 | -0.376231 | -2.570729 | 4476.049805 | -5097.599121 |
| 4.113636 | -0.415427 | 1.562076 | -0.065806 | 0.003290 | 52.263515 |

sequential multiplied matches original?: 1

gpu:
| 176.000000 | 133.000000 | 431.000000 | 839.000000 | 739.000000 | 450.000000 |
| 2.880682 | 334.869293 | -571.573914 | -1663.892212 | -2006.823975 | -355.306885 |
| 3.392045 | -0.006397 | -869.627808 | -2114.569580 | -2028.558716 | -1316.693359 |
| 0.903409 | 2.460203 | -2.085742 | -357.892578 | 5091.575684 | -2059.688965 |
| 1.511364 | 1.654343 | -0.376232 | -2.570732 | 16116.155273 | -5097.604980 |
| 4.113636 | -0.415427 | -0.737347 | 2.005755 | -3.655331 | -237.480438 |

gpu multiplied matches original?:
values differ: 5053.05 -- 822
values differ: 5091.58 -- 860.526
0
correct solution? 0
EDIT: Okay, I understand why it was not working before, I think. The reason is that I only synchronize within each workgroup. When I call my kernel with a workgroup size equal to the number of items in the matrix it is correct, because then the barriers work properly. However, I decided to go with the approach mentioned in the comments: enqueue multiple kernels and wait for each kernel to finish before starting the next one. This maps onto the iteration over each row of the matrix, multiplying it with the pivot element. It makes sure that no kernel modifies or reads elements that are being modified by another kernel at that point.
Again, this only works for small matrices, so I think I was wrong in assuming the problem was synchronization only. As per the request of Baiz I am posting my entire main here, which calls the kernel:
int main(int argc, char *argv[])
{
    try
    {
        if (argc != 5)
        {
            std::ostringstream oss;
            oss << "usage: " << argv[0] << " <kernel_file> <kernel_name> <workgroup_size> <array width>";
            throw std::runtime_error(oss.str());
        }

        // Read in the arguments.
        std::string kernel_file(argv[1]);
        std::string kernel_name(argv[2]);
        unsigned int workgroup_size = atoi(argv[3]);
        unsigned int array_dimension = atoi(argv[4]);
        int total_matrix_length = array_dimension * array_dimension;

        // Print the parameters.
        std::cout << "workgroup size: " << workgroup_size << std::endl;
        std::cout << "array dimension: " << array_dimension << std::endl;

        // Create a matrix to work on.
        // Create a random array.
        int matrix_width = sqrt(total_matrix_length);
        float* input_matrix = new float[total_matrix_length];
        input_matrix = randomMatrix(total_matrix_length);

        /// Debugging
        //float* input_matrix = new float[9];
        //int matrix_width = 3;
        //total_matrix_length = matrix_width * matrix_width;
        //input_matrix[0] = 10; input_matrix[1] = -7; input_matrix[2] = 0;
        //input_matrix[3] = -3; input_matrix[4] = 2;  input_matrix[5] = 6;
        //input_matrix[6] = 5;  input_matrix[7] = -1; input_matrix[8] = 5;

        // Allocate memory on the host and populate the source.
        float *gpu_result = new float[total_matrix_length];

        // OpenCL initialization.
        std::vector<cl::Platform> platforms;
        std::vector<cl::Device> devices;
        cl::Platform::get(&platforms);
        platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices);
        cl::Context context(devices);
        cl::CommandQueue queue(context, devices[0], CL_QUEUE_PROFILING_ENABLE);

        // Load the kernel source.
        std::string file_text;
        std::ifstream file_stream(kernel_file.c_str());
        if (!file_stream)
        {
            std::ostringstream oss;
            oss << "There is no file called " << kernel_file;
            throw std::runtime_error(oss.str());
        }
        file_text.assign(std::istreambuf_iterator<char>(file_stream), std::istreambuf_iterator<char>());

        // Compile the kernel source.
        std::string source_code = file_text;
        std::pair<const char *, size_t> source(source_code.c_str(), source_code.size());
        cl::Program::Sources sources;
        sources.push_back(source);
        cl::Program program(context, sources);
        try
        {
            program.build(devices);
        }
        catch (cl::Error& e)
        {
            std::string msg;
            program.getBuildInfo<std::string>(devices[0], CL_PROGRAM_BUILD_LOG, &msg);
            std::cerr << "Your kernel failed to compile" << std::endl;
            std::cerr << "-----------------------------" << std::endl;
            std::cerr << msg;
            throw(e);
        }

        // Allocate memory on the device.
        cl::Buffer source_buf(context, CL_MEM_READ_ONLY, total_matrix_length*sizeof(float));
        cl::Buffer dest_buf(context, CL_MEM_WRITE_ONLY, total_matrix_length*sizeof(float));

        // Create the actual kernel.
        cl::Kernel kernel(program, kernel_name.c_str());

        // Transfer the source data from the host to the device.
        queue.enqueueWriteBuffer(source_buf, CL_TRUE, 0, total_matrix_length*sizeof(float), input_matrix);

        for (int pivot_idx = 0; pivot_idx < matrix_width; pivot_idx++)
        {
            // Set the kernel arguments.
            kernel.setArg<cl::Memory>(0, source_buf);
            kernel.setArg<cl::Memory>(1, dest_buf);
            kernel.setArg<cl_uint>(2, total_matrix_length);
            kernel.setArg<cl_uint>(3, matrix_width);
            kernel.setArg<cl_int>(4, pivot_idx);

            // Execute the code on the device.
            std::cout << "enqueueing new kernel " << pivot_idx << std::endl;
            cl::Event evt;
            queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(total_matrix_length), cl::NDRange(workgroup_size), 0, &evt);
            evt.wait();
            std::cout << "iteration " << pivot_idx << " done" << std::endl;
        }

        // Transfer the destination data from the device to the host.
        queue.enqueueReadBuffer(dest_buf, CL_TRUE, 0, total_matrix_length*sizeof(float), gpu_result);

        // Calculate sequentially.
        float* sequential = decompose(input_matrix, matrix_width);

        // Print out the results.
        std::cout << "sequential:\n";
        printMatrix(total_matrix_length, sequential);

        // Print out the results.
        std::cout << "gpu:\n";
        printMatrix(total_matrix_length, gpu_result);

        std::cout << "correct solution? " << equalMatrices(gpu_result, sequential, total_matrix_length);

        // Compute the data throughput in GB/s.
        //float throughput = (2.0*total_matrix_length*sizeof(float)) / t; // t is in nanoseconds
        //std::cout << "achieved throughput: " << throughput << std::endl;

        // Cleanup: deallocate memory.
        delete[] gpu_result;
        delete[] input_matrix;
        delete[] sequential;

        return 0;
    }
    catch (cl::Error& e)
    {
        std::cerr << e.what() << ": " << jc::readable_status(e.err());
        return 3;
    }
    catch (std::exception& e)
    {
        std::cerr << e.what() << std::endl;
        return 2;
    }
    catch (...)
    {
        std::cerr << "Unexpected error. Aborting!\n" << std::endl;
        return 1;
    }
}
As Mazzzu stated, due to the parallel execution of the work items you cannot be sure whether an element in the array has already been read/written. You can ensure this using

CLK_LOCAL_MEM_FENCE / CLK_GLOBAL_MEM_FENCE

However, these mechanisms only work on threads within the same work group. There is no way to synchronize work items from different work groups.
Your problems are:

- You use multiple work groups for an algorithm which is only executable by a single work group.
- You don't use enough barriers.

If you use a single work group, try adding a

barrier(CLK_GLOBAL_MEM_FENCE);

to all the parts that read/write from/to destin.
You should restructure your algorithm:

- Have one work group perform the algorithm on your matrix.
- Use local memory for better performance (since you repeatedly access elements).
- Use barriers everywhere. Once the algorithm works you can start removing them, after working out which ones you don't need.

Could you post your kernel call and the work sizes?
EDIT: From your algorithm I came up with the code below. I haven't tested it and I doubt it will work right away, but it should help you understand how to parallelize your sequential algorithm. It decomposes the matrix with only one kernel launch.
Some restrictions:

- This code only works with a single work group.
- It will not work for matrices whose size exceeds the maximum local work-group size (probably between 256 and 1024).

If you want to change that, you should refactor the algorithm to use only as many work items as the matrix is wide. Just adapt this to your kernel.setArg(...) code:
int nbElements = width * height;
clSetKernelArg(kernel, 0, sizeof(a), &a);
clSetKernelArg(kernel, 1, sizeof(u), &u);
clSetKernelArg(kernel, 2, sizeof(float) * widthMat * heightMat, NULL); // local memory
clSetKernelArg(kernel, 3, sizeof(int), &width);
clSetKernelArg(kernel, 4, sizeof(int), &height);
clSetKernelArg(kernel, 5, sizeof(int), &nbElements);
Kernel code:
inline int indexFrom2D(const int u, const int v, const int width)
{
    return width * v + u;
}

kernel void decompose(global float* a,
                      global float* u,
                      local float* localBuffer,
                      const int widthMat,
                      const int heightMat,
                      const int nbElements)
{
    int gidx = get_global_id(0);
    int col = gidx % widthMat;
    int row = gidx / widthMat;

    if (gidx >= nbElements)
        return;

    // Copy from global to local memory.
    localBuffer[gidx] = a[gidx];

    // Sync the copy process.
    barrier(CLK_LOCAL_MEM_FENCE);

    for (int rowOuter = 0; rowOuter < widthMat; ++rowOuter)
    {
        int pivotIdx = rowOuter;
        float pivotValue = localBuffer[indexFrom2D(pivotIdx, pivotIdx, widthMat)];

        // Data for all work items in this row.
        float belowPivot = localBuffer[indexFrom2D(pivotIdx, row, widthMat)];
        float divisor = belowPivot / pivotValue;
        float value = localBuffer[indexFrom2D(col, rowOuter, widthMat)];

        // Only work items below the pivot row and on or to the right of the pivot column.
        // (Written as explicit range checks; a chained comparison like
        // widthMat > col >= pivotIdx would not do what it looks like in C.)
        if (col >= pivotIdx && col < widthMat && row >= pivotIdx + 1 && row < heightMat)
        {
            localBuffer[indexFrom2D(col, row, widthMat)] =
                localBuffer[indexFrom2D(col, row, widthMat)] - (value * divisor);

            if (col == pivotIdx)
                localBuffer[indexFrom2D(pivotIdx, row, widthMat)] = divisor;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Write back to global memory.
    u[gidx] = localBuffer[gidx];
}
c++ algorithm matrix opencl