Head

GCC Code Coverage Report

Directory:	.		Exec	Total	Coverage
File:	frame/primitives/rank_k.hpp	Lines:	0	32	0.0 %
Date:	2019-01-14	Branches:	0	100	0.0 %


/**
 *  HMLP (High-Performance Machine Learning Primitives)
 *
 *  Copyright (C) 2014-2017, The University of Texas at Austin
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program. If not, see the LICENSE file.
 *
 **/



#ifndef RANK_K_HPP
#define RANK_K_HPP

#include <assert.h>
#include <typeinfo>
#include <algorithm>

#include <hmlp.h>
#include <hmlp_internal.hpp>
#include <hmlp_base.hpp>

/** reference microkernels */
#include <packing.hpp>
#include <semiring_mrxnr.hpp>
#include <fused_mrxnr.hpp>

using namespace std;
using namespace hmlp;

namespace hmlp
{
/**
 *  @brief Macro kernel contains the 3rd and 2nd loops. Depending on the
 *         configuration of the communicator, the 3rd loop may be parallelized.
 *         b_next is the prefetch pointer.
 */
template<int KC, typename SEMIRINGKERNEL, typename TA, typename TB, typename TV>
void rank_k_macro_kernel
(
  tci::Comm &Comm3rd,
  int ic, int jc, int pc,
  int  m, int  n, int  k,
  TA *packA,
  TB *packB,
  TV *V, int rs_v, int cs_v,
  SEMIRINGKERNEL semiringkernel
)
{
  /** Get all block sizes */
  const static int MR         = SEMIRINGKERNEL::mr;
  const static int NR         = SEMIRINGKERNEL::nr;
  const static int PACK_MR    = SEMIRINGKERNEL::pack_mr;
  const static int PACK_NR    = SEMIRINGKERNEL::pack_nr;
  /** Create subcommunicators for each loop. */
  auto Comm2nd = Comm3rd.Split( hmlp_read_nway_from_env( "KS_JR_NT" ) );
  /** Compute loop ranges for each thread */
  auto Loop3rd = Comm3rd.DistributeOver1DGangs(        0, n,      NR );
  auto Pack3rd = Comm3rd.DistributeOver1DGangs(        0, n, PACK_NR );
  auto Loop2nd = Comm2nd.DistributeOver1DThreads(      0, m,      MR );
  auto Pack2nd = Comm2nd.DistributeOver1DThreads(      0, m, PACK_MR );
  /** Distribute range [0,n) over Comm3rd (jr loop). */
  for ( int j  = Loop3rd.beg(), jp  = Pack3rd.beg();
            j  < Loop3rd.end();
            j += Loop3rd.inc(), jp += Pack3rd.inc() )
  {
    struct aux_s<TA, TB, TV, TV> aux;
    aux.pc       = pc;
    aux.b_next   = packB;
    aux.do_packC = 0;
    aux.jb       = std::min( n - j, NR );
    /** Distribute range [0,m) over Comm2nd (ir loop). */
    for ( int i  = Loop2nd.beg(), ip  = Pack2nd.beg();
              i  < Loop2nd.end();
              i += Loop2nd.inc(), ip += Pack2nd.inc() )
    {
      aux.ib = std::min( m - i, MR );
      /** Increase the b_next pointer. */
      if ( i + MR >= m ) aux.b_next += Pack3rd.inc() * k;

      if ( aux.jb == NR && aux.ib == MR )
      {
        semiringkernel( k, &packA[ ip * k ], &packB[ jp * k ],
          &V[ i * rs_v + j * cs_v ], rs_v, cs_v, &aux );
      }
      else
      {
        TV vtmp[ MR * NR ];

        if ( pc ) // initilize ctmp
        {
          for ( auto jj = 0; jj < aux.jb; jj ++ )
            for ( auto ii = 0; ii < aux.ib; ii ++ )
              vtmp[ jj * MR + ii ] =
                V[ ( j + jj ) * cs_v + ( i + ii ) * rs_v ];
        }

        semiringkernel( k, &packA[ ip * k ], &packB[ jp * k ],
          vtmp, 1, MR, &aux );

        for ( auto jj = 0; jj < aux.jb; jj ++ )
          for ( auto ii = 0; ii < aux.ib; ii ++ )
            V[ ( j + jj ) * cs_v + ( i + ii ) * rs_v ]
              = vtmp[ jj * MR + ii ];
      }
    } /** end 2nd loop */
  } /** end 3rd loop */
}; /** end rank_k_macro_kernel() */



/**
 *  @breif This function contains the loop body of the 6th to 4th loops,
 *         including all packing and unpacking routines. Notice that this
 *         function is executed by all threads in the root communicator.
 *         To access each thread in different level of communicators, use
 *         their ids.
 */
template<
  int MC, int NC, int KC,
  typename TPACKA, typename TPACKB, typename TV,
  typename     TA, typename     TB, typename TC,
  typename SEMIRINGKERNEL>
void rank_k_internal
(
  tci::Comm &Comm6th,
  int batchId, int m, int n, int k, int k_stra,
  TA& A,
  TB& B,
  TV* V, int rs_v, int cs_v,
  SEMIRINGKERNEL semiringkernel
)
{
  /** Get all block sizes. */
  const static int MR         = SEMIRINGKERNEL::mr;
  const static int NR         = SEMIRINGKERNEL::nr;
  const static int PACK_MR    = SEMIRINGKERNEL::pack_mr;
  const static int PACK_NR    = SEMIRINGKERNEL::pack_nr;
  const static int ALIGN_SIZE = SEMIRINGKERNEL::align_size;
  const static int PACK_MC    = ( MC / MR ) * PACK_MR;
  const static int PACK_NC    = ( NC / NR ) * PACK_NR;
  /** Create subcommunicators for each loop. */
  auto Comm5th = Comm6th.Split( hmlp_read_nway_from_env( "KS_JC_NT" ) );
  auto Comm4th = Comm5th.Split( 1 );
  auto Comm3th = Comm4th.Split( hmlp_read_nway_from_env( "KS_IC_NT" ) );
  /** Adjuest nc and pack_nc if the 6th loop is parallelized. */
  int nc = Comm6th.BalanceOver1DGangs( n, NC, NR );
  int pack_nc = ( nc / NR ) * PACK_NR;
  /** Allocate packB (shared over Comm4th, private for each Comm5th gang). */
  auto *packB = Comm4th.AllocateSharedMemory<ALIGN_SIZE, TPACKB>( KC * ( pack_nc + 1 ) );
  /** Allocate packA (shared over Comm3th, private for each Comm4th gang). */
  auto *packA = Comm3th.AllocateSharedMemory<ALIGN_SIZE, TPACKA>( KC * ( PACK_MC + 1 ) );
  /** Distribute range [0,n) over Comm6th. */
  auto Loop6th = Comm6th.DistributeOver1DGangs(      0, n, nc );
  /** Distribute range [k_stra,k) over Comm5th. */
  auto Loop5th = Comm5th.DistributeOver1DGangs( k_stra, k, KC );
  /** Distribute range [0,m) over Comm4th. */
  auto Loop4th = Comm4th.DistributeOver1DGangs(      0, m, MC );
  /** Distribute range [0,n) over Comm6th. */
  for ( int jc  = Loop6th.beg();
            jc  < Loop6th.end();
            jc += Loop6th.inc() )
  {
    auto jb = std::min( n - jc, nc );
    /** Distribute range [k_stra,k) over Comm5th. */
    for ( int pc  = Loop5th.beg();
              pc  < Loop5th.end();
              pc += Loop5th.inc() )
    {
      auto pb = std::min( k - pc, KC );
      /** Distribute range [0,jb) over Comm4th. */
      auto LooppkB = Comm4th.DistributeOver1DThreads( 0, jb,      NR );
      auto PackpkB = Comm4th.DistributeOver1DThreads( 0, jb, PACK_NR );
      /** PackB and typecast from TB to TPACKB.  */
      for ( int j  = LooppkB.beg(), jp  = PackpkB.beg();
                j  < LooppkB.end();
                j += LooppkB.inc(), jp += PackpkB.inc() )
      {
        B.Pack( k, pc, pb, n, jc + j, std::min( jb - j, NR ),
            &packB[ jp * pb ] );
      }
      /** Synchronize all threads in Comm4th. */
      Comm4th.Barrier();
      /** Distribute range [0,m) over Comm4th. */
      for ( int ic  = Loop4th.beg();
                ic  < Loop4th.end();
                ic += Loop4th.inc() )
      {
        auto ib = std::min( m - ic, MC );
        /** Distribute range [0,ib) over Comm3th. */
        auto LooppkA = Comm3th.DistributeOver1DThreads( 0, ib, MR );
        auto PackpkA = Comm3th.DistributeOver1DThreads( 0, ib, PACK_MR );
        /** packA and typecast from TA to TPACKA. */
        for ( int i  = LooppkA.beg(), ip  = PackpkA.beg();
                  i  < LooppkA.end();
                  i += LooppkA.inc(), ip += PackpkA.inc() )
        {
          A.Pack( m, ic + i, std::min( ib - i, MR ),
              k, pc, pb, &packA[ ip * pb ] );
        }
        /** Synchronize all threads in Comm3th. */
        Comm3th.Barrier();
        /** Otherwise, invoke the semiubg rank-k kernel. */
        rank_k_macro_kernel<KC>( Comm3th,
          ic, jc, pc, ib, jb, pb, packA, packB,
          V + ic * rs_v + jc * cs_v, rs_v, cs_v,
          semiringkernel );
        /** Synchronize all threads in Comm3th. */
        Comm3th.Barrier();
      } /** end 4th loop */
      Comm4th.Barrier();
    } /** end 5th loop */
    Comm5th.Barrier();
  } /** end 6th loop */
  Comm6th.Barrier();
  /** Free packing buffer. */
  Comm3th.FreeSharedMemory( packA );
  Comm4th.FreeSharedMemory( packB );
}; /** end nbody_internal() */





/**
 *  @breif This is the main routine of gkmx. All packing buffers are
 *         managed here. The communicator and the parallel section
 *         start here.
 *
 */
template<
  int MC, int NC, int KC,
  typename TPACKA, typename TPACKB, typename TV,
  typename     TA, typename     TB, typename TC,
  typename SEMIRINGKERNEL>
void rank_k
(
  int batchId, int m, int n, int k,
  TA& A,
  TB& B,
  TC& C,
  SEMIRINGKERNEL semiringkernel
)
{
  const static int MR         = SEMIRINGKERNEL::mr;
  const static int NR         = SEMIRINGKERNEL::nr;
  const static int PACK_MR    = SEMIRINGKERNEL::pack_mr;
  const static int PACK_NR    = SEMIRINGKERNEL::pack_nr;
  const static int ALIGN_SIZE = SEMIRINGKERNEL::align_size;
  const static int PACK_MC    = ( MC / MR ) * PACK_MR;
  const static int PACK_NC    = ( NC / NR ) * PACK_NR;
  const static bool USE_STRASSEN = false;

  /** Early return if possible. */
  if ( m == 0 || n == 0 || k == 0 ) return;
  /** Type C must be MatrixLike.  */
  if ( !is_same<TC, MatrixLike<PACK_MR, TV, TV>>::value )
  {
    exit( 1 );
  }
  /** Now get the pointer, row and column stride. */
  auto *V = reinterpret_cast<TV*>( C.X );
  auto rs_v = C.rs;
  auto cs_v = C.cs;


  int k_stra = 0;
  if ( USE_STRASSEN )
  {
    assert( typeid(TPACKA) == typeid(TPACKB) );
    assert( typeid(TC) == typeid(TV) );
    k_stra = k - k % KC;

    if ( k_stra == k ) k_stra -= KC;
  }

  tci::Parallelize( NULL, rank_k_internal<MC, NC, KC, TPACKA, TPACKB, TV,
      TA, TB, TC, SEMIRINGKERNEL>,
      batchId, m, n, k, k_stra, A, B, C, V, rs_v, cs_v,
      semiringkernel );
}; /** end rank_k() */

}; /** end namespace hmlp */

#endif /** define RANK_K_HPP */


Generated by: GCOVR (Version 3.2)

Line	Exec	Source
1		/**
2		* HMLP (High-Performance Machine Learning Primitives)
3		*
4		* Copyright (C) 2014-2017, The University of Texas at Austin
5		*
6		* This program is free software: you can redistribute it and/or modify
7		* it under the terms of the GNU General Public License as published by
8		* the Free Software Foundation, either version 3 of the License, or
9		* (at your option) any later version.
10		*
11		* This program is distributed in the hope that it will be useful,
12		* but WITHOUT ANY WARRANTY; without even the implied warranty of
13		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		* GNU General Public License for more details.
15		*
16		* You should have received a copy of the GNU General Public License
17		* along with this program. If not, see the LICENSE file.
18		*
19		**/
20
21
22
23		#ifndef RANK_K_HPP
24		#define RANK_K_HPP
25
26		#include <assert.h>
27		#include <typeinfo>
28		#include <algorithm>
29
30		#include <hmlp.h>
31		#include <hmlp_internal.hpp>
32		#include <hmlp_base.hpp>
33
34		/** reference microkernels */
35		#include <packing.hpp>
36		#include <semiring_mrxnr.hpp>
37		#include <fused_mrxnr.hpp>
38
39		using namespace std;
40		using namespace hmlp;
41
42		namespace hmlp
43		{
44		/**
45		* @brief Macro kernel contains the 3rd and 2nd loops. Depending on the
46		* configuration of the communicator, the 3rd loop may be parallelized.
47		* b_next is the prefetch pointer.
48		*/
49		template<int KC, typename SEMIRINGKERNEL, typename TA, typename TB, typename TV>
50		void rank_k_macro_kernel
51		(
52		tci::Comm &Comm3rd,
53		int ic, int jc, int pc,
54		int m, int n, int k,
55		TA *packA,
56		TB *packB,
57		TV *V, int rs_v, int cs_v,
58		SEMIRINGKERNEL semiringkernel
59		)
60		{
61		/** Get all block sizes */
62		const static int MR = SEMIRINGKERNEL::mr;
63		const static int NR = SEMIRINGKERNEL::nr;
64		const static int PACK_MR = SEMIRINGKERNEL::pack_mr;
65		const static int PACK_NR = SEMIRINGKERNEL::pack_nr;
66		/** Create subcommunicators for each loop. */
67		auto Comm2nd = Comm3rd.Split( hmlp_read_nway_from_env( "KS_JR_NT" ) );
68		/** Compute loop ranges for each thread */
69		auto Loop3rd = Comm3rd.DistributeOver1DGangs( 0, n, NR );
70		auto Pack3rd = Comm3rd.DistributeOver1DGangs( 0, n, PACK_NR );
71		auto Loop2nd = Comm2nd.DistributeOver1DThreads( 0, m, MR );
72		auto Pack2nd = Comm2nd.DistributeOver1DThreads( 0, m, PACK_MR );
73		/** Distribute range [0,n) over Comm3rd (jr loop). */
74		for ( int j = Loop3rd.beg(), jp = Pack3rd.beg();
75		j < Loop3rd.end();
76		j += Loop3rd.inc(), jp += Pack3rd.inc() )
77		{
78		struct aux_s<TA, TB, TV, TV> aux;
79		aux.pc = pc;
80		aux.b_next = packB;
81		aux.do_packC = 0;
82		aux.jb = std::min( n - j, NR );
83		/** Distribute range [0,m) over Comm2nd (ir loop). */
84		for ( int i = Loop2nd.beg(), ip = Pack2nd.beg();
85		i < Loop2nd.end();
86		i += Loop2nd.inc(), ip += Pack2nd.inc() )
87		{
88		aux.ib = std::min( m - i, MR );
89		/** Increase the b_next pointer. */
90		if ( i + MR >= m ) aux.b_next += Pack3rd.inc() * k;
91
92		if ( aux.jb == NR && aux.ib == MR )
93		{
94		semiringkernel( k, &packA[ ip * k ], &packB[ jp * k ],
95		&V[ i * rs_v + j * cs_v ], rs_v, cs_v, &aux );
96		}
97		else
98		{
99		TV vtmp[ MR * NR ];
100
101		if ( pc ) // initilize ctmp
102		{
103		for ( auto jj = 0; jj < aux.jb; jj ++ )
104		for ( auto ii = 0; ii < aux.ib; ii ++ )
105		vtmp[ jj * MR + ii ] =
106		V[ ( j + jj ) * cs_v + ( i + ii ) * rs_v ];
107		}
108
109		semiringkernel( k, &packA[ ip * k ], &packB[ jp * k ],
110		vtmp, 1, MR, &aux );
111
112		for ( auto jj = 0; jj < aux.jb; jj ++ )
113		for ( auto ii = 0; ii < aux.ib; ii ++ )
114		V[ ( j + jj ) * cs_v + ( i + ii ) * rs_v ]
115		= vtmp[ jj * MR + ii ];
116		}
117		} /** end 2nd loop */
118		} /** end 3rd loop */
119		}; /** end rank_k_macro_kernel() */
120
121
122
123		/**
124		* @breif This function contains the loop body of the 6th to 4th loops,
125		* including all packing and unpacking routines. Notice that this
126		* function is executed by all threads in the root communicator.
127		* To access each thread in different level of communicators, use
128		* their ids.
129		*/
130		template<
131		int MC, int NC, int KC,
132		typename TPACKA, typename TPACKB, typename TV,
133		typename TA, typename TB, typename TC,
134		typename SEMIRINGKERNEL>
135		void rank_k_internal
136		(
137		tci::Comm &Comm6th,
138		int batchId, int m, int n, int k, int k_stra,
139		TA& A,
140		TB& B,
141		TV* V, int rs_v, int cs_v,
142		SEMIRINGKERNEL semiringkernel
143		)
144		{
145		/** Get all block sizes. */
146		const static int MR = SEMIRINGKERNEL::mr;
147		const static int NR = SEMIRINGKERNEL::nr;
148		const static int PACK_MR = SEMIRINGKERNEL::pack_mr;
149		const static int PACK_NR = SEMIRINGKERNEL::pack_nr;
150		const static int ALIGN_SIZE = SEMIRINGKERNEL::align_size;
151		const static int PACK_MC = ( MC / MR ) * PACK_MR;
152		const static int PACK_NC = ( NC / NR ) * PACK_NR;
153		/** Create subcommunicators for each loop. */
154		auto Comm5th = Comm6th.Split( hmlp_read_nway_from_env( "KS_JC_NT" ) );
155		auto Comm4th = Comm5th.Split( 1 );
156		auto Comm3th = Comm4th.Split( hmlp_read_nway_from_env( "KS_IC_NT" ) );
157		/** Adjuest nc and pack_nc if the 6th loop is parallelized. */
158		int nc = Comm6th.BalanceOver1DGangs( n, NC, NR );
159		int pack_nc = ( nc / NR ) * PACK_NR;
160		/** Allocate packB (shared over Comm4th, private for each Comm5th gang). */
161		auto packB = Comm4th.AllocateSharedMemory<ALIGN_SIZE, TPACKB>( KC ( pack_nc + 1 ) );
162		/** Allocate packA (shared over Comm3th, private for each Comm4th gang). */
163		auto packA = Comm3th.AllocateSharedMemory<ALIGN_SIZE, TPACKA>( KC ( PACK_MC + 1 ) );
164		/** Distribute range [0,n) over Comm6th. */
165		auto Loop6th = Comm6th.DistributeOver1DGangs( 0, n, nc );
166		/** Distribute range [k_stra,k) over Comm5th. */
167		auto Loop5th = Comm5th.DistributeOver1DGangs( k_stra, k, KC );
168		/** Distribute range [0,m) over Comm4th. */
169		auto Loop4th = Comm4th.DistributeOver1DGangs( 0, m, MC );
170		/** Distribute range [0,n) over Comm6th. */
171		for ( int jc = Loop6th.beg();
172		jc < Loop6th.end();
173		jc += Loop6th.inc() )
174		{
175		auto jb = std::min( n - jc, nc );
176		/** Distribute range [k_stra,k) over Comm5th. */
177		for ( int pc = Loop5th.beg();
178		pc < Loop5th.end();
179		pc += Loop5th.inc() )
180		{
181		auto pb = std::min( k - pc, KC );
182		/** Distribute range [0,jb) over Comm4th. */
183		auto LooppkB = Comm4th.DistributeOver1DThreads( 0, jb, NR );
184		auto PackpkB = Comm4th.DistributeOver1DThreads( 0, jb, PACK_NR );
185		/** PackB and typecast from TB to TPACKB. */
186		for ( int j = LooppkB.beg(), jp = PackpkB.beg();
187		j < LooppkB.end();
188		j += LooppkB.inc(), jp += PackpkB.inc() )
189		{
190		B.Pack( k, pc, pb, n, jc + j, std::min( jb - j, NR ),
191		&packB[ jp * pb ] );
192		}
193		/** Synchronize all threads in Comm4th. */
194		Comm4th.Barrier();
195		/** Distribute range [0,m) over Comm4th. */
196		for ( int ic = Loop4th.beg();
197		ic < Loop4th.end();
198		ic += Loop4th.inc() )
199		{
200		auto ib = std::min( m - ic, MC );
201		/** Distribute range [0,ib) over Comm3th. */
202		auto LooppkA = Comm3th.DistributeOver1DThreads( 0, ib, MR );
203		auto PackpkA = Comm3th.DistributeOver1DThreads( 0, ib, PACK_MR );
204		/** packA and typecast from TA to TPACKA. */
205		for ( int i = LooppkA.beg(), ip = PackpkA.beg();
206		i < LooppkA.end();
207		i += LooppkA.inc(), ip += PackpkA.inc() )
208		{
209		A.Pack( m, ic + i, std::min( ib - i, MR ),
210		k, pc, pb, &packA[ ip * pb ] );
211		}
212		/** Synchronize all threads in Comm3th. */
213		Comm3th.Barrier();
214		/** Otherwise, invoke the semiubg rank-k kernel. */
215		rank_k_macro_kernel<KC>( Comm3th,
216		ic, jc, pc, ib, jb, pb, packA, packB,
217		V + ic * rs_v + jc * cs_v, rs_v, cs_v,
218		semiringkernel );
219		/** Synchronize all threads in Comm3th. */
220		Comm3th.Barrier();
221		} /** end 4th loop */
222		Comm4th.Barrier();
223		} /** end 5th loop */
224		Comm5th.Barrier();
225		} /** end 6th loop */
226		Comm6th.Barrier();
227		/** Free packing buffer. */
228		Comm3th.FreeSharedMemory( packA );
229		Comm4th.FreeSharedMemory( packB );
230		}; /** end nbody_internal() */
231
232
233
234
235
236		/**
237		* @breif This is the main routine of gkmx. All packing buffers are
238		* managed here. The communicator and the parallel section
239		* start here.
240		*
241		*/
242		template<
243		int MC, int NC, int KC,
244		typename TPACKA, typename TPACKB, typename TV,
245		typename TA, typename TB, typename TC,
246		typename SEMIRINGKERNEL>
247		void rank_k
248		(
249		int batchId, int m, int n, int k,
250		TA& A,
251		TB& B,
252		TC& C,
253		SEMIRINGKERNEL semiringkernel
254		)
255		{
256		const static int MR = SEMIRINGKERNEL::mr;
257		const static int NR = SEMIRINGKERNEL::nr;
258		const static int PACK_MR = SEMIRINGKERNEL::pack_mr;
259		const static int PACK_NR = SEMIRINGKERNEL::pack_nr;
260		const static int ALIGN_SIZE = SEMIRINGKERNEL::align_size;
261		const static int PACK_MC = ( MC / MR ) * PACK_MR;
262		const static int PACK_NC = ( NC / NR ) * PACK_NR;
263		const static bool USE_STRASSEN = false;
264
265		/** Early return if possible. */
266		if ( m == 0 \|\| n == 0 \|\| k == 0 ) return;
267		/** Type C must be MatrixLike. */
268		if ( !is_same<TC, MatrixLike<PACK_MR, TV, TV>>::value )
269		{
270		exit( 1 );
271		}
272		/** Now get the pointer, row and column stride. */
273		auto V = reinterpret_cast<TV>( C.X );
274		auto rs_v = C.rs;
275		auto cs_v = C.cs;
276
277
278		int k_stra = 0;
279		if ( USE_STRASSEN )
280		{
281		assert( typeid(TPACKA) == typeid(TPACKB) );
282		assert( typeid(TC) == typeid(TV) );
283		k_stra = k - k % KC;
284
285		if ( k_stra == k ) k_stra -= KC;
286		}
287
288		tci::Parallelize( NULL, rank_k_internal<MC, NC, KC, TPACKA, TPACKB, TV,
289		TA, TB, TC, SEMIRINGKERNEL>,
290		batchId, m, n, k, k_stra, A, B, C, V, rs_v, cs_v,
291		semiringkernel );
292		}; /** end rank_k() */
293
294		}; /** end namespace hmlp */
295
296		#endif /** define RANK_K_HPP */