Thursday, 15 March 2012

c++ - Performance degradation if loop count is not known at compile time on Xeon Phi



I am writing a simple matrix multiplication routine for the Intel Xeon Phi architecture.

After many attempts with autovectorization, trying to improve performance, I had to use Intel intrinsics.

Until now the matrix size was given by a #define in the source code, but when I try to give it at run time I get a huge performance degradation.

The source code is the following:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <stddef.h>
#include <chrono>
#include <ctime>
#include <mmintrin.h>
#include <xmmintrin.h>  // SSE
#include <pmmintrin.h>  // SSE3
#include <emmintrin.h>  // SSE2
#include <immintrin.h>
#include <zmmintrin.h>

#define alignment 64

#ifndef SIZE
#define SIZE 960
#endif

#define vzero(c) {(c) = _mm512_setzero_pd();}

#define start_time() \
    auto start = std::chrono::high_resolution_clock::now();

/** Shows elapsed time. See start_time for usage. */
#define elapsed_time(string) \
    auto elapsed = std::chrono::high_resolution_clock::now() - start; \
    long long microseconds = std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count(); \
    printf(#string":%lld\n", microseconds);

void rectranspose(double *__restrict__ a, double *__restrict__ at,
                  const int n, const int k, const int lda, const int ldat) {
    if (n*k <= 128) {
        for(int i = 0; i < n; i++) {
            for(int j = 0; j < k; j++) {
                at[j*ldat+i] = a[i*lda+j];
            }
        }
        //printf("reached _|_");
        return;
    }
    if(k > n) {
        rectranspose(a, at, n, (k+1)/2, lda, ldat);
        rectranspose(&a[(k+1)/2], &at[(k+1)/2*ldat], n, k-((k+1)/2), lda, ldat);
    } else {
        rectranspose(a, at, (n+1)/2, k, lda, ldat);
        rectranspose(&a[(n+1)/2*lda], &at[(n+1)/2], n-(n+1)/2, k, lda, ldat);
    }
}

/** Calculates 8 cols and 30 rows of c. */
inline void eightbythirty(double *__restrict__ a, double *__restrict__ b,
                          double *__restrict__ c, const int size) {
    __m512d c0, c1, c2, c3, c4, c5, c6, c7, c8, c9;
    __m512d c10, c11, c12, c13, c14, c15, c16, c17, c18, c19;
    __m512d c20, c21, c22, c23, c24, c25, c26, c27, c28, c29;
    vzero(c0); vzero(c1); vzero(c2); vzero(c3); vzero(c4);
    vzero(c5); vzero(c6); vzero(c7); vzero(c8); vzero(c9);
    vzero(c10); vzero(c11); vzero(c12); vzero(c13); vzero(c14);
    vzero(c15); vzero(c16); vzero(c17); vzero(c18); vzero(c19);
    vzero(c20); vzero(c21); vzero(c22); vzero(c23); vzero(c24);
    vzero(c25); vzero(c26); vzero(c27); vzero(c28); vzero(c29);

    __assume_aligned(a, alignment);
    __assume_aligned(b, alignment);
    __assume_aligned(c, alignment);
    __assume(size%16==0);
    for(int i = 0; i < size; i++) {
        const __m512d bv = _mm512_load_pd(b+i*size);
        c0 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+0, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c0);
        c1 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c1);
        c2 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+2, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c2);
        c3 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+3, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c3);
        c4 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+4, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c4);
        c5 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+5, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c5);
        c6 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+6, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c6);
        c7 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+7, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c7);
        c8 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+8, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c8);
        c9 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+9, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c9);
        c10 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+10, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c10);
        c11 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+11, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c11);
        c12 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+12, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c12);
        c13 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+13, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c13);
        c14 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+14, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c14);
        c15 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+15, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c15);
        c16 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+16, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c16);
        c17 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+17, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c17);
        c18 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+18, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c18);
        c19 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+19, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c19);
        c20 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+20, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c20);
        c21 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+21, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c21);
        c22 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+22, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c22);
        c23 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+23, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c23);
        c24 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+24, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c24);
        c25 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+25, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c25);
        c26 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+26, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c26);
        c27 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+27, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c27);
        c28 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+28, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c28);
        c29 = _mm512_fmadd_pd(_mm512_extload_pd(a+i*size+29, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0), bv, c29);
    }
    _mm512_storenr_pd(c+0*size, c0);
    _mm512_storenr_pd(c+1*size, c1);
    _mm512_storenr_pd(c+2*size, c2);
    _mm512_storenr_pd(c+3*size, c3);
    _mm512_storenr_pd(c+4*size, c4);
    _mm512_storenr_pd(c+5*size, c5);
    _mm512_storenr_pd(c+6*size, c6);
    _mm512_storenr_pd(c+7*size, c7);
    _mm512_storenr_pd(c+8*size, c8);
    _mm512_storenr_pd(c+9*size, c9);
    _mm512_storenr_pd(c+10*size, c10);
    _mm512_storenr_pd(c+11*size, c11);
    _mm512_storenr_pd(c+12*size, c12);
    _mm512_storenr_pd(c+13*size, c13);
    _mm512_storenr_pd(c+14*size, c14);
    _mm512_storenr_pd(c+15*size, c15);
    _mm512_storenr_pd(c+16*size, c16);
    _mm512_storenr_pd(c+17*size, c17);
    _mm512_storenr_pd(c+18*size, c18);
    _mm512_storenr_pd(c+19*size, c19);
    _mm512_storenr_pd(c+20*size, c20);
    _mm512_storenr_pd(c+21*size, c21);
    _mm512_storenr_pd(c+22*size, c22);
    _mm512_storenr_pd(c+23*size, c23);
    _mm512_storenr_pd(c+24*size, c24);
    _mm512_storenr_pd(c+25*size, c25);
    _mm512_storenr_pd(c+26*size, c26);
    _mm512_storenr_pd(c+27*size, c27);
    _mm512_storenr_pd(c+28*size, c28);
    _mm512_storenr_pd(c+29*size, c29);
}

int main(int argc, const char ** argv) {
#ifdef SIZES
    const int size = SIZE;
#else
    const int size = atoi(argv[1]);
#endif
    void* p = malloc((sizeof(double)*5*size*size) + alignment-1);
    double *__restrict__ a  = (double*)(((size_t)p + alignment-1) / alignment * alignment);
    double *__restrict__ at = a+size*size;
    double *__restrict__ b  = at+size*size;
    double *__restrict__ c  = b+size*size;
    double *__restrict__ d  = c+size*size;

    srand(time(NULL));
    for(int i = 0; i < size; i++) {
        for(int j = 0; j < size; j++) {
            a[i*size+j] = (double) (rand()%20);
        }
        for(int j2 = 0; j2 < size; j2++) {
            c[i*size+j2] = 0.0;
        }
    }
    for(int i = 0; i < size; i++) {
        for(int j = 0; j < size; j++) {
            b[i*size+j] = (double) (rand()%20);
        }
    }

    start_time();
    rectranspose(a, at, size, size, size, size);
    for(int i = 0; i < size; i+=30) {
        for(int j = 0; j < size; j+=8) {
            eightbythirty(&at[i], &b[j], &c[i*size+j], size);
        }
    }
    elapsed_time();
    double gflops = 2.0*size*size*size*1.0e-03/(microseconds);
    printf("gflops: %f\n", gflops);

    // reference multiplication for the correctness check
    for(int i = 0; i < size; i++) {
        for(int j = 0; j < size; j++) {
            double s = 0;
            for(int u = 0; u < size; u++) {
                s += a[i*size+u] * b[u*size+j];
            }
            d[i*size+j] = s;
        }
    }
    int error = 0;
    for(int i = 0; i < size; i++) {
        for(int j = 0; j < size; j++) {
            if(fabs(c[i*size+j] - d[i*size+j]) > 1) {
                printf("error at %d %d, %f instead of %f\n", i, j, c[i*size+j], d[i*size+j]);
                error++;
                if(error > 16) return 0;
            }
        }
    }
    printf("ok\n");
}

So, for example, with size 960 (for now it only works with sizes that are multiples of 30*8):

If I compile with the size given at compile time: icc -mmic -O3 -restrict -std=c++11 -DSIZES -DSIZE=960 mmul.cpp -o mmul.o

elapsed time: 0.460745s
gflops: 3.840458

If I compile with the size given at run time: icc -mmic -O3 -restrict -std=c++11 mmul.cpp -o mmul.o

elapsed time: 2.204564s
gflops: 0.802640
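(As a sanity check on those numbers: 2*960^3 is about 1.77e9 floating-point operations, so 1.77e9 / 0.461 s is about 3.84 GFLOP/s and 1.77e9 / 2.205 s is about 0.80 GFLOP/s, i.e. the reported GFLOPS follow directly from the elapsed times.)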

I'm thinking it is a prefetching issue: icc can't recognize the memory access pattern. Looking at the generated asm source, the number of vprefetch instructions is much higher in the "compile time" version.
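For reference, the comparison can be made roughly like this (a sketch; -S just emits assembly instead of an object file, and the .s file names are arbitrary):

icc -mmic -O3 -restrict -std=c++11 -DSIZES -DSIZE=960 -S mmul.cpp -o mmul_static.s
icc -mmic -O3 -restrict -std=c++11 -S mmul.cpp -o mmul_dynamic.s
grep -c vprefetch mmul_static.s
grep -c vprefetch mmul_dynamic.s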

Funny fact: the check for the correct result of the multiplication (the two loops at the end of the code, rows 178-197) is much slower in the compile-time version!

Any thoughts? I tried #pragma loop_count but it seems useless, and doing manual prefetching with intrinsics doesn't seem effective either.
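For reference, what I mean is roughly the following, around the i loop of eightbythirty (the prefetch distances and hint levels here are only illustrative, not necessarily what makes sense on the Phi):

#pragma loop_count(960)                  // hint the expected trip count to icc
for(int i = 0; i < size; i++) {
    // manual prefetch of upcoming rows of b; distances are arbitrary
    _mm_prefetch((const char*)(b+(i+1)*size), _MM_HINT_T0);
    _mm_prefetch((const char*)(b+(i+8)*size), _MM_HINT_T1);
    const __m512d bv = _mm512_load_pd(b+i*size);
    /* ... same 30 fmadd accumulations as above ... */
}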

Thanks in advance for any answer.

Regards, Luca

The fundamental theorem of computer science states that every problem can be solved with a layer of indirection.

The idea is to leave your code with fixed-size loops, and write code that dispatches to the right fixed-size loop.

First, alter eightbythirty to read like this:

template<int size>
inline void eightbythirty(double *__restrict__ a, double *__restrict__ b, double *__restrict__ c) {

with an identical implementation inside. Put it in a namespace details, as it usually isn't meant to be user-facing.
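A minimal sketch of what that means (only the shell is shown; the body stays the same, with every use of the former runtime size argument now referring to the template parameter):

namespace details {

template<int size>
inline void eightbythirty(double *__restrict__ a, double *__restrict__ b, double *__restrict__ c) {
    // identical body to the original eightbythirty; the trip count of the
    // inner loop is now a compile-time constant the compiler can see
}

} // namespace details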

Next, wrap it:

inline void eightbythirty(double *__restrict__ a, double *__restrict__ b, double *__restrict__ c, const int size_divided_by_240) {
    int size = size_divided_by_240;
    switch( size&7 ) {
        case 0: break;
        case 01: eightbythirty<01>(a,b,c); break;
        case 02: eightbythirty<02>(a,b,c); break;
        case 03: eightbythirty<03>(a,b,c); break;
        case 04: eightbythirty<04>(a,b,c); break;
        case 05: eightbythirty<05>(a,b,c); break;
        case 06: eightbythirty<06>(a,b,c); break;
        case 07: eightbythirty<07>(a,b,c); break;
    }
    a += (size&7)*8*30;
    b += (size&7)*8*30;
    c += (size&7)*8*30;
    switch( (size>>3)&7 ) {
        case 0: break;
        case 01: eightbythirty<1*8>(a,b,c); break;
        case 02: eightbythirty<2*8>(a,b,c); break;
        case 03: eightbythirty<3*8>(a,b,c); break;
        case 04: eightbythirty<4*8>(a,b,c); break;
        case 05: eightbythirty<5*8>(a,b,c); break;
        case 06: eightbythirty<6*8>(a,b,c); break;
        case 07: eightbythirty<7*8>(a,b,c); break;
    }
    a += (size&(7<<3))*8*30;
    b += (size&(7<<3))*8*30;
    c += (size&(7<<3))*8*30;
    switch( (size>>6)&7 ) {
        case 0: break;
        case 01: eightbythirty<1*8*8>(a,b,c); break;
        case 02: eightbythirty<2*8*8>(a,b,c); break;
        case 03: eightbythirty<3*8*8>(a,b,c); break;
        case 04: eightbythirty<4*8*8>(a,b,c); break;
        case 05: eightbythirty<5*8*8>(a,b,c); break;
        case 06: eightbythirty<6*8*8>(a,b,c); break;
        case 07: eightbythirty<7*8*8>(a,b,c); break;
    }
    a += (size&(7<<6))*8*30;
    b += (size&(7<<6))*8*30;
    c += (size&(7<<6))*8*30;
    int steps = size/8/8/8;
    for( int i = 0; i < steps; ++i ) {
        eightbythirty<512>(a+512*i, b+512*i, c+512*i);
    }
}

This breaks the input size into 3-bit chunks and invokes fixed-size implementations for each. Only 4 branches occur in the above code, much of it simple jump tables, for values less than 512*8*30. For values greater than that, things are done in chunks of 512*8*30.
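For instance, if the wrapper is invoked the way its parameter name suggests, size 960 gives size_divided_by_240 = 960/240 = 4: only case 4 of the first switch fires, the other two switches see 0, and the trailing loop runs zero steps.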

7*3+1 = 22 implementations of the original function get instantiated, each with a constant size, so the compiler can optimize each of them.

This could be done generically with metaprogramming, but it isn't worth it for a one-off use.
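For what it's worth, a minimal sketch of what the generic version could look like (the names dispatch_chunk and mid_chunk are hypothetical, and it assumes the fixed-size eightbythirty<int> lives in namespace details as suggested above):

namespace details {

// Map a runtime chunk value n in [0, 7] to a compile-time constant by a
// linear template recursion, then hand it to a Body adapter.
template<int N, template<int> class Body>
struct dispatch_chunk {
    static void run(int n, double *__restrict__ a, double *__restrict__ b, double *__restrict__ c) {
        if (n == N) Body<N>::call(a, b, c);
        else dispatch_chunk<N - 1, Body>::run(n, a, b, c);
    }
};

template<template<int> class Body>
struct dispatch_chunk<0, Body> {
    static void run(int, double *__restrict__, double *__restrict__, double *__restrict__) {
        // chunk value 0: nothing to do
    }
};

// Adapter for the middle ((size>>3)&7) chunk: scale the chunk index by 8,
// exactly as the hand-written switch above does.
template<int N>
struct mid_chunk {
    static void call(double *__restrict__ a, double *__restrict__ b, double *__restrict__ c) {
        eightbythirty<N * 8>(a, b, c);
    }
};

} // namespace details

// Usage, replacing the second switch of the wrapper:
//   details::dispatch_chunk<7, details::mid_chunk>::run((size >> 3) & 7, a, b, c);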

I may be missing some *(8*30) factors in the above code where I call the <int size> version of eightbythirty.

c++ c++11 intrinsics xeon-phi
