#include #include #include // Compute C=AB using namespace std; typedef float real_type; void print_time(const char* st, const boost::timer& timer, real_type* c, unsigned N) { double time= timer.elapsed(), mflops= 2*N*N*N / time / 1e6; cout << st << time <<"s, " << mflops << " MFLOPS\n"; if (fabs(c[3*N+7] - 6.0 * N) > 0.01) cout << "wrong result\n"; } int main(int argc, char** argv) { unsigned N= 512, B= 32; if (argc > 1) { N= atoi(argv[1]); B= atoi(argv[2]); } unsigned blocks= N / B, NS= N*N; real_type *a= new real_type[NS], *b= new real_type[NS], *c= new real_type[NS]; fill(&a[0], &a[NS], 2.0); fill(&b[0], &b[NS], 3.0); boost::timer t1; fill(&c[0], &c[NS], 0.0); for (unsigned ii= 0; ii < blocks; ++ii) for (unsigned jj= 0; jj < blocks; ++jj) for (unsigned kk= 0; kk < blocks; ++kk) for (unsigned i= ii*B, i_end= (ii+1)*B; i < i_end; ++i) for (unsigned j= jj*B, j_end= (jj+1)*B; j < j_end; ++j) for (unsigned k= kk*B, k_end= (kk+1)*B; k < k_end; ++k) c[i*N+k]+= a[i*N+j] * b[j*N+k]; print_time("i, j, k = ", t1, c, N); boost::timer t2; fill(&c[0], &c[NS], 0.0); for (unsigned ii= 0; ii < blocks; ++ii) for (unsigned kk= 0; kk < blocks; ++kk) for (unsigned jj= 0; jj < blocks; ++jj) for (unsigned i= ii*B, i_end= (ii+1)*B; i < i_end; ++i) for (unsigned k= kk*B, k_end= (kk+1)*B; k < k_end; ++k) for (unsigned j= jj*B, j_end= (jj+1)*B; j < j_end; ++j) c[i*N+k]+= a[i*N+j] * b[j*N+k]; print_time("i, k, j = ", t2, c, N); boost::timer t3; fill(&c[0], &c[NS], 0.0); for (unsigned jj= 0; jj < blocks; ++jj) for (unsigned ii= 0; ii < blocks; ++ii) for (unsigned kk= 0; kk < blocks; ++kk) for (unsigned j= jj*B, j_end= (jj+1)*B; j < j_end; ++j) for (unsigned i= ii*B, i_end= (ii+1)*B; i < i_end; ++i) for (unsigned k= kk*B, k_end= (kk+1)*B; k < k_end; ++k) c[i*N+k]+= a[i*N+j] * b[j*N+k]; print_time("j, i, k = ", t3, c, N); boost::timer t4; fill(&c[0], &c[NS], 0.0); for (unsigned jj= 0; jj < blocks; ++jj) for (unsigned kk= 0; kk < blocks; ++kk) for (unsigned ii= 0; ii < blocks; ++ii) for (unsigned j= jj*B, j_end= (jj+1)*B; j < j_end; ++j) for (unsigned k= kk*B, k_end= (kk+1)*B; k < k_end; ++k) for (unsigned i= ii*B, i_end= (ii+1)*B; i < i_end; ++i) c[i*N+k]+= a[i*N+j] * b[j*N+k]; print_time("j, k, i = ", t4, c, N); boost::timer t5; fill(&c[0], &c[NS], 0.0); for (unsigned kk= 0; kk < blocks; ++kk) for (unsigned ii= 0; ii < blocks; ++ii) for (unsigned jj= 0; jj < blocks; ++jj) for (unsigned k= kk*B, k_end= (kk+1)*B; k < k_end; ++k) for (unsigned i= ii*B, i_end= (ii+1)*B; i < i_end; ++i) for (unsigned j= jj*B, j_end= (jj+1)*B; j < j_end; ++j) c[i*N+k]+= a[i*N+j] * b[j*N+k]; print_time("k, i, j = ", t5, c, N); boost::timer t6; fill(&c[0], &c[NS], 0.0); for (unsigned kk= 0; kk < blocks; ++kk) for (unsigned jj= 0; jj < blocks; ++jj) for (unsigned ii= 0; ii < blocks; ++ii) for (unsigned k= kk*B, k_end= (kk+1)*B; k < k_end; ++k) for (unsigned j= jj*B, j_end= (jj+1)*B; j < j_end; ++j) for (unsigned i= ii*B, i_end= (ii+1)*B; i < i_end; ++i) c[i*N+k]+= a[i*N+j] * b[j*N+k]; print_time("k, j, i = ", t6, c, N); return 0; }