Unit 3.3.5 Implementation: five loops around the micro-kernel, with packing
¶
Below we illustrate how packing is incorporated into the "Five loops around the micro-kernel".
data:image/s3,"s3://crabby-images/d6f84/d6f8401c943c827cb7b0e9e4ed805de6f9cba2e7" alt=""
/*
 * Fifth (outermost) loop around the micro-kernel.
 *
 * Walks the n dimension in steps of NC.  On every step LoopFour receives
 * A unchanged, the addresses of beta( 0,jc ) and gamma( 0,jc ) as its B and
 * C arguments, and the width of the current step as its n.
 */
void LoopFive( int m, int n, int k, double *A, int ldA,
               double *B, int ldB, double *C, int ldC )
{
  int jc = 0;

  while ( jc < n ) {
    /* The final step is narrower than NC when NC does not divide n. */
    int width = min( NC, n-jc );

    LoopFour( m, width, k, A, ldA, &beta( 0,jc ), ldB, &gamma( 0,jc ), ldC );

    jc += NC;
  }
}
data:image/s3,"s3://crabby-images/d46cf/d46cf9cc7befd6c3c3a8b5a55994d32df3a22e0a" alt=""
/*
 * Fourth loop around the micro-kernel.
 *
 * Steps through the k dimension in chunks of at most KC.  For each chunk,
 * PackPanelB_KCxNC is called with the sizes pb and n and the address of
 * beta( p,0 ) to fill the scratch buffer Btilde; LoopThree is then called
 * with the address of alpha( 0,p ) as its A argument and Btilde in place
 * of B.
 *
 * Btilde has room for KC * NC doubles.  It is allocated once per call and
 * freed before returning.  If the allocation fails the program is stopped
 * with abort() instead of handing a null buffer to the packing routine.
 *
 * NOTE(review): the buffer is sized for NC columns, so this presumably
 * relies on n <= NC (LoopFive passes at most NC) -- confirm for other
 * callers.
 */
void LoopFour( int m, int n, int k, double *A, int ldA,
               double *B, int ldB, double *C, int ldC )
{
  /* sizeof comes first so the product is formed in size_t, not int. */
  double *Btilde = malloc( sizeof *Btilde * ( KC ) * ( NC ) );

  if ( Btilde == NULL ) {
    abort();
  }

  for ( int p=0; p<k; p+=KC ) {
    int pb = min( KC, k-p );    /* Last loop may not involve a full block */

    PackPanelB_KCxNC( pb, n, &beta( p, 0 ), ldB, Btilde );
    LoopThree( m, n, pb, &alpha( 0, p ), ldA, Btilde, C, ldC );
  }

  free( Btilde );
}
data:image/s3,"s3://crabby-images/7e8f4/7e8f4c810e1bb11e6664c2b6ffcf2dda80926818" alt=""
/*
 * Third loop around the micro-kernel.
 *
 * Steps through the m dimension in chunks of at most MC.  For each chunk,
 * PackBlockA_MCxKC is called with the sizes ib and k and the address of
 * alpha( i,0 ) to fill the scratch buffer Atilde; LoopTwo is then called
 * with Atilde, the already packed Btilde, and the address of gamma( i,0 )
 * as its C argument.
 *
 * Atilde has room for MC * KC doubles.  It is allocated once per call and
 * freed before returning.  If the allocation fails the program is stopped
 * with abort() instead of handing a null buffer to the packing routine.
 *
 * NOTE(review): the buffer is sized for KC columns, so this presumably
 * relies on k <= KC (LoopFour passes at most KC) -- confirm for other
 * callers.
 */
void LoopThree( int m, int n, int k, double *A, int ldA,
                double *Btilde, double *C, int ldC )
{
  /* sizeof comes first so the product is formed in size_t, not int. */
  double *Atilde = malloc( sizeof *Atilde * ( MC ) * ( KC ) );

  if ( Atilde == NULL ) {
    abort();
  }

  for ( int i=0; i<m; i+=MC ) {
    int ib = min( MC, m-i );    /* Last loop may not involve a full block */

    PackBlockA_MCxKC( ib, k, &alpha( i, 0 ), ldA, Atilde );
    LoopTwo( ib, n, k, Atilde, Btilde, &gamma( i,0 ), ldC );
  }

  free( Atilde );
}
data:image/s3,"s3://crabby-images/7934d/7934d00e044775f85a83e042d94394102bafe83a" alt=""
/*
 * Second loop around the micro-kernel.
 *
 * Steps through the n dimension in increments of NR.  On each step LoopOne
 * is called with the whole of Atilde, the part of Btilde that starts at
 * element j*k, and the address of gamma( 0,j ) as its C argument.  Because
 * j advances by NR, consecutive steps are NR*k doubles apart in Btilde.
 */
void LoopTwo( int m, int n, int k, double *Atilde, double *Btilde,
              double *C, int ldC )
{
  for ( int j=0; j<n; j+=NR ) {
    int jb = min( NR, n-j );    /* Last step may be narrower than NR */

    LoopOne( m, jb, k, Atilde, &Btilde[ j*k ], &gamma( 0,j ), ldC );
  }
}
data:image/s3,"s3://crabby-images/b2208/b2208c2319ae62dbd5f92bd72c75424c55325ac7" alt=""
void LoopOne( int m, int n, int k, double *Atilde, double *Micro-PanelB, double *C, int ldC ) { for ( int i=0; i<m; i+=MR ) { int ib = min( MR, m-i ); Gemm_MRxNRKernel_Packed( k,Ã[ i*k ], Micro-PanelB, &gamma( i,0 ), ldC ); } }