// Blocked matrix multiplication // Blocking factor B must be a multiple of 4 (positive multiple ;-) ) // Matrix size N must be a multiple of B (positive multiple again ;-) ) #include #include #include // Compute C=AB using namespace std; typedef double real_type; real_type result_i_j (real_type i, real_type j, real_type N) { return 1.0/3.0 * N * (1.0 - 3*i - 3*j + 6*i*j - 3*N + 3*i*N + 3*j*N + 2*N*N); } void print_time(const char* st, const boost::timer& timer, real_type* c, unsigned N) { double time= timer.elapsed(), mflops= 2*N*N*N / time / 1e6; cout << st << time <<"s, " << mflops << " MFLOPS\n"; // some tests if (fabs(c[3*N+2] - result_i_j(3, 2, N)) > 0.01) cout << "wrong result: c[3, 2] = " << c[3*N+2] << ", should be " << result_i_j(3, 2, N) << "\n"; if (N>= 7 && fabs(c[5*N+7] - result_i_j(5, 7, N)) > 0.01) cout << "wrong result: c[5, 7] = " << c[5*N+7] << ", should be " << result_i_j(5, 7, N) << "\n"; if (N>= 128 && fabs(c[55*N+127] - result_i_j(55, 127, N)) > 0.01) cout << "wrong result: c[55, 127] = " << c[55*N+127] << ", should be " << result_i_j(55, 127, N) << "\n"; } void fill_matrix(real_type factor, real_type *m, unsigned N) { for (unsigned i= 0; i < N; ++i) for (unsigned j= 0; j < N; ++j) m[i*N+j]= factor * (real_type(i) + real_type(j)); } int main(int argc, char** argv) { unsigned N= 544, B= 32; if (argc > 1) { N= atoi(argv[1]); B= atoi(argv[2]); } unsigned blocks= N / B, NS= N*N; if (B % 4) { cerr << "I said B must be multiple of 4!!!!\n"; return 1; } if (N % B) { cerr << "I said N must be multiple of B!!!!\n"; return 1; } real_type *a= new real_type[NS], *b= new real_type[NS], *c= new real_type[NS]; fill_matrix(1., a, N); fill_matrix(2., b, N); if (N>= 7) cout << "a[3, 7] = " << a[3*N+7] << ", b[3, 7] = " << b[3*N+7] << "\n"; boost::timer t1; fill(&c[0], &c[NS], 0.0); for (unsigned ii= 0; ii < blocks; ++ii) for (unsigned jj= 0; jj < blocks; ++jj) for (unsigned kk= 0; kk < blocks; ++kk) for (unsigned i= ii*B, i_end= (ii+1)*B; i < i_end; ++i) for (unsigned j= jj*B, j_end= (jj+1)*B; j < j_end; ++j) for (unsigned k= kk*B, k_end= (kk+1)*B; k < k_end; k= k+4) { unsigned index_a= i*N+j, index_b= j*N+k, index_c= i*N+k; c[index_c]+= a[index_a] * b[index_b]; c[index_c+1]+= a[index_a] * b[index_b+1]; c[index_c+2]+= a[index_a] * b[index_b+2]; c[index_c+3]+= a[index_a] * b[index_b+3]; } print_time("i, j, k = ", t1, c, N); boost::timer t2; fill(&c[0], &c[NS], 0.0); for (unsigned ii= 0; ii < blocks; ++ii) for (unsigned kk= 0; kk < blocks; ++kk) for (unsigned jj= 0; jj < blocks; ++jj) for (unsigned i= ii*B, i_end= (ii+1)*B; i < i_end; ++i) for (unsigned k= kk*B, k_end= (kk+1)*B; k < k_end; ++k) { real_type tmp0= 0.0, tmp1= 0.0, tmp2= 0.0, tmp3= 0.0; for (unsigned j= jj*B, j_end= (jj+1)*B; j < j_end; j= j+4) { unsigned index_a= i*N+j, index_b= j*N+k; tmp0+= a[index_a] * b[index_b]; tmp1+= a[index_a+1] * b[index_b+N]; tmp2+= a[index_a+2] * b[index_b+2*N]; tmp3+= a[index_a+3] * b[index_b+3*N]; } c[i*N+k]+= tmp0 + tmp1 + tmp2 + tmp3; } print_time("i, k, j = ", t2, c, N); return 0; }