GCC Code Coverage Report
Directory: . Exec Total Coverage
File: frame/containers/MLPGaussNewton.hpp Lines: 0 161 0.0 %
Date: 2019-01-14 Branches: 0 166 0.0 %

Line Exec Source
1
#ifndef MLPGAUSSNEWTON_HPP
2
#define MLPGAUSSNEWTON_HPP
3
4
/** GEMM task support */
5
#include <primitives/gemm.hpp>
6
/** MLPGaussNewton uses VirtualMatrix<T> as base */
7
#include <containers/VirtualMatrix.hpp>
8
/** For GOFMM compatability */
9
#include <containers/SPDMatrix.hpp>
10
11
using namespace std;
12
using namespace hmlp;
13
14
namespace hmlp
15
{
16
17
/** Layer categories supported by the network container below
 *  (only INPUT and FC have specializations in this file). */
typedef enum { INPUT, FC, CONV2D, POOLING } LayerType;
18
19
20
template<typename T>
21
class LayerBase
22
{
23
  public:
24
25
    LayerBase( size_t user_N, size_t user_B )
26
    :
27
    N( user_N ), B( user_B )
28
    {};
29
30
    /** Number of neurons */
31
    size_t NeuronSize() { return N; };
32
33
    size_t BatchSize() { return B; };
34
35
    virtual size_t ParameterSize() = 0;
36
37
    /** The input and output types are subjected to change. */
38
    virtual void Forward() = 0;
39
40
    virtual Data<T> & GetValues() = 0;
41
42
  private:
43
44
    size_t N = 0;
45
46
    size_t B = 0;
47
48
};
49
50
template<LayerType LAYERTYPE, typename T>
51
class Layer : public LayerBase<T> {};
52
53
template<typename T>
54
class Layer<INPUT, T> : public LayerBase<T>
55
{
56
  public:
57
58
    Layer( size_t N, size_t B ) : LayerBase<T>( N, B )
59
    {
60
      x.resize( N, B );
61
    };
62
63
    size_t ParameterSize() { return 0; };
64
65
    void Forward() { /** NOP */ };
66
67
68
    void SetValues( const Data<T> &user_x )
69
    {
70
      //assert( user_x.row() == x.row() && user_x.col() == x.col() );
71
      x = user_x;
72
    };
73
74
    Data<T> & GetValues() { return x; };
75
76
  private:
77
78
    Data<T> x;
79
80
};
81
82
83
template<typename T>
84
class Layer<FC, T> : public LayerBase<T>
85
{
86
  public:
87
88
    Layer( size_t N, size_t B, LayerBase<T> &user_prev )
89
    :
90
    LayerBase<T>( N, B ), prev( user_prev )
91
    {
92
      x.resize( N, B );
93
      w.resize( N, prev.NeuronSize() );
94
      /** Initialize w as normal( 0, 0.1 ) */
95
      w.randn( 0.0, 0.1 );
96
      mpi::Bcast( w.data(), w.size(), 0, MPI_COMM_WORLD );
97
    };
98
99
    size_t ParameterSize() { return w.size(); };
100
101
    /** x = w  * prev.x + bias */
102
    void Forward()
103
    {
104
      Data<T> &A = w;
105
      Data<T> &B = prev.GetValues();
106
      Data<T> &C = x;
107
      /** C = AB^{T} */
108
      //gemm::xgemm( HMLP_OP_N, HMLP_OP_N, (T)1.0, A, B, (T)0.0, C );
109
      xgemm( "N", "N", C.row(), C.col(), B.row(),
110
          1.0, A.data(), A.row(),
111
               B.data(), B.row(),
112
          0.0, C.data(), C.row() );
113
      /** RELU activation function */
114
      nnz = 0;
115
      for ( auto &c : C )
116
      {
117
        c = std::max( c, (T)0 );
118
        if ( c ) nnz ++;
119
      }
120
      printf( "Layer report: %8lu/%8lu nnz\n", nnz, C.size() );
121
    };
122
123
124
    /** The tuple contains ( lid in I, idi of W, idj of W ) */
125
    Data<T> PerturbedGradient( bool use_reduced_format, size_t i, size_t j )
126
    {
127
      assert( i < w.row() && j < w.col() );
128
      Data<T> &B = prev.GetValues();
129
      if ( use_reduced_format )
130
      {
131
        Data<T> G( 1, this->BatchSize(), 0 );
132
        for ( size_t b = 0; b < this->BatchSize(); b ++ )
133
        {
134
          if ( x( i, b ) ) G[ b ] = B( j, b );
135
        }
136
        return G;
137
      }
138
      else
139
      {
140
        Data<T> G( this->NeuronSize(), this->BatchSize(), 0 );
141
        for ( size_t b = 0; b < this->BatchSize(); b ++ )
142
        {
143
          if ( x( i, b ) ) G( i, b ) = B( j, b );
144
        }
145
        return G;
146
      }
147
    };
148
149
    Data<T> Gradient( Data<T> &B )
150
    {
151
      Data<T> &A = w;
152
      Data<T> G( this->NeuronSize(), this->BatchSize(), 0 );
153
      //gemm::xgemm( HMLP_OP_N, HMLP_OP_N, (T)1.0, A, B, (T)0.0, G );
154
      xgemm( "N", "N", G.row(), G.col(), B.row(),
155
          1.0, A.data(), A.row(),
156
               B.data(), B.row(),
157
          0.0, G.data(), G.row() );
158
      /** RELU gradient */
159
      for ( size_t i = 0; i < x.size(); i ++ )
160
      {
161
        if ( !x[ i ] ) G[ i ] = 0;
162
      }
163
      return G;
164
    };
165
166
    Data<T> Gradient( Data<T> &B, size_t q )
167
    {
168
      Data<T> &A = w;
169
      Data<T> G( this->NeuronSize(), this->BatchSize(), 0 );
170
      /** RELU gradient */
171
      for ( size_t j = 0; j < x.col(); j ++ )
172
      {
173
        for ( size_t i = 0; i < x.row(); i ++ )
174
        {
175
          if ( x( i, j ) ) G( i, j ) = w( i, q ) * B[ j ];
176
        }
177
      }
178
      return G;
179
    };
180
181
    Data<T> & GetValues() { return x; };
182
183
    Data<T> & GetParameters() { return w; };
184
185
    size_t NumberNonZeros() { return nnz; };
186
187
  private:
188
189
    /** N-by-B */
190
    Data<T> x;
191
192
    /** N-by-prev.N */
193
    Data<T> w;
194
195
    Data<T> bias;
196
197
    LayerBase<T> &prev;
198
199
    size_t nnz = 0;
200
201
};
202
203
204
template<typename T>
205
class MLPGaussNewton : public VirtualMatrix<T>,
206
                       public SPDMatrixMPISupport<T>
207
{
208
  public:
209
210
    MLPGaussNewton()
211
    {
212
      filename = string( "/scratch/02794/ych/data/tmp/MLPGaussNewton.bin" );
213
    };
214
215
    void AppendInputLayer( Layer<INPUT, T> &layer )
216
    {
217
      this->in = &layer;
218
    }
219
220
    void AppendFCLayer( Layer<FC, T> &layer )
221
    {
222
      size_t N = this->row() + layer.ParameterSize();
223
      this->resize( N, N );
224
      net.push_back( &layer );
225
      index_range.push_back( N );
226
      printf( "Layer %3lu N %8lu\n", net.size(), N ); fflush( stdout );
227
    };
228
229
    void Update( const Data<T> &data )
230
    {
231
      int comm_rank; mpi::Comm_rank( MPI_COMM_WORLD, &comm_rank );
232
      int comm_size; mpi::Comm_size( MPI_COMM_WORLD, &comm_size );
233
234
      in->SetValues( data );
235
      /** Feed forward */
236
      for ( auto layer : net ) layer->Forward();
237
238
      //fd = open( filename.data(), O_RDWR | O_CREAT, 0 );
239
      //assert( fd != -1 );
240
241
      //uint64_t data_size = this->col();
242
      //data_size *= in->BatchSize();
243
      //data_size *= sizeof(T);
244
245
      //if ( comm_rank == 0 )
246
      //{
247
      //  if ( lseek( fd, data_size - 1, SEEK_SET ) == -1 ) printf( "lseek failure\n" );
248
      //  if ( write( fd, "", 1 ) == -1 ) printf( "write failure\n" );
249
      //}
250
      //mpi::Barrier( MPI_COMM_WORLD );
251
252
      //mmappedData = (T*)mmap(0, data_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0 );
253
      //if ( mmappedData == MAP_FAILED ) printf( "mmap failure\n" );
254
255
      //#pragma omp parallel for
256
      //for ( size_t j = comm_rank; j < this->col(); j += comm_size )
257
      //{
258
      //  Data<T> Jj = Jacobian( vector<size_t>( 1, j ) );
259
      //  uint64_t offj = (uint64_t)Jj.size() * (uint64_t)j;
260
      //  for ( size_t i = 0; i < Jj.size(); i ++ ) mmappedData[ offj + i ] = Jj[ i ];
261
      //}
262
      //mpi::Barrier( MPI_COMM_WORLD );
263
    };
264
265
    void WriteJacobianToFiles( string filename )
266
    {
267
      int comm_rank; mpi::Comm_rank( MPI_COMM_WORLD, &comm_rank );
268
      int comm_size; mpi::Comm_size( MPI_COMM_WORLD, &comm_size );
269
270
      uint64_t nb = 2048;
271
      uint64_t nf = ( net.back()->NumberNonZeros() - 1 ) / nb + 1;
272
      uint64_t data_size = (uint64_t)this->col() * nb * sizeof(float);
273
274
      vector<int> fd( nf );
275
      vector<float*> mappedData( nf );
276
277
      if ( comm_rank == 0 )
278
      {
279
        for ( size_t f = 0; f < nf; f ++ )
280
        {
281
          string filepath = filename + to_string( f * nb );
282
          fd[ f ] = open( filepath.data(), O_RDWR | O_CREAT, 0 );
283
          assert( fd[ f ] != -1 );
284
        }
285
      }
286
      mpi::Barrier( MPI_COMM_WORLD );
287
      if ( comm_rank > 0 )
288
      {
289
        for ( size_t f = 0; f < nf; f ++ )
290
        {
291
          string filepath = filename + to_string( f * nb );
292
          fd[ f ] = open( filepath.data(), O_RDWR, 0 );
293
          assert( fd[ f ] != -1 );
294
        }
295
      }
296
      if ( comm_rank == 0 )
297
      {
298
        for ( size_t f = 0; f < nf; f ++ )
299
        {
300
          if ( lseek( fd[ f ], data_size - 1, SEEK_SET ) == -1 ) printf( "lseek failure\n" );
301
          if ( write( fd[ f ], "", 1 ) == -1 ) printf( "write failure\n" );
302
        }
303
      }
304
      mpi::Barrier( MPI_COMM_WORLD );
305
      for ( size_t f = 0; f < nf; f ++ )
306
      {
307
        mappedData[ f ] = (float*)mmap( 0, data_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd[ f ], 0 );
308
        if ( mappedData[ f ] == MAP_FAILED ) printf( "mmap failure\n" );
309
      }
310
      mpi::Barrier( MPI_COMM_WORLD );
311
312
      size_t zero_columns = 0;
313
314
      #pragma omp parallel for schedule(dynamic)
315
      for ( size_t j = comm_rank; j < this->col(); j += comm_size )
316
      {
317
        if ( j % 5000 == 0 ) printf( "%8lu columns computed (rank %3d)\n", j, comm_rank );
318
        Data<T> Jj = Jacobian( vector<size_t>( 1, j ) );
319
        uint64_t offj = (uint64_t)nb * (uint64_t)j;
320
        size_t nnz = 0;
321
        for ( size_t i = 0; i < Jj.size(); i ++ )
322
        {
323
          if ( i < Jj.size() )
324
          {
325
            mappedData[ i / nb ][ offj + ( i % nb ) ] = (float)Jj[ i ];
326
            if ( Jj[ i ] ) nnz ++;
327
          }
328
          else
329
          {
330
            mappedData[ i / nb ][ offj + ( i % nb ) ] = 0;
331
          }
332
        }
333
        if ( !nnz )
334
        {
335
          #pragma omp atomic update
336
          zero_columns ++;
337
        }
338
      }
339
      mpi::Barrier( MPI_COMM_WORLD );
340
      printf( "zero column %lu\n", zero_columns );
341
342
      for ( size_t f = 0; f < nf; f ++ )
343
      {
344
        int rc = munmap( mappedData[ f ], data_size );
345
        assert( rc == 0 );
346
        close( fd[ f ] );
347
      }
348
      mpi::Barrier( MPI_COMM_WORLD );
349
    };
350
351
352
353
    /** ESSENTIAL: this is an abstract function  */
354
    virtual T operator()( size_t i, size_t j )
355
    {
356
      Data<T> KIJ = (*this)( vector<size_t>( 1, i ), vector<size_t>( 1, j ) );
357
      return KIJ[ 0 ];
358
    };
359
360
    /** ESSENTIAL: return a submatrix */
361
    virtual Data<T> operator()
362
		(
363
      const vector<size_t> &I, const vector<size_t> &J
364
    )
365
    {
366
      Data<T> KIJ( I.size(), J.size() );
367
      Data<T> A = Jacobian( I );
368
      Data<T> B = Jacobian( J );
369
370
      //size_t d = net.back()->NeuronSize() * net.back()->BatchSize();
371
      //Data<T> A( d, I.size() );
372
      //Data<T> B( d, J.size() );
373
      //for ( size_t j = 0; j < I.size(); j ++ )
374
      //{
375
      //  uint64_t offj = (uint64_t)d * (uint64_t)j;
376
      //  for ( size_t i = 0; i < d; i ++ ) A( i, j ) = mmappedData[ offj + i ];
377
      //}
378
      //for ( size_t j = 0; j < J.size(); j ++ )
379
      //{
380
      //  uint64_t offj = (uint64_t)d * (uint64_t)j;
381
      //  for ( size_t i = 0; i < d; i ++ ) B( i, j ) = mmappedData[ offj + i ];
382
      //}
383
384
      /** KIJ = A^{T}B */
385
      xgemm( "T", "N", KIJ.row(), KIJ.col(), B.row(),
386
          1.0, A.data(), A.row(),
387
               B.data(), B.row(),
388
          0.0, KIJ.data(), KIJ.row() );
389
      return KIJ;
390
    };
391
392
393
  private:
394
395
    Layer<INPUT, T> *in = NULL;
396
397
    /** [ L0, L1, ..., Lq-1 ], q layers */
398
    vector<Layer<FC, T>*> net;
399
400
    /** [ #W1, #W1+#W1, ..., #W1+...+#Wq-1], sorted q numbers */
401
    vector<size_t> index_range;
402
403
    Data<T> Jcache;
404
405
    vector<size_t> all_rows;
406
407
    string filename;
408
409
    /** Use mmap */
410
    T *mmappedData = NULL;
411
412
    int fd = -1;
413
414
415
416
    /** [ RELU(Wq-1)*...*RELU(W1), ..., RELU(Wq-1), I ], q products */
417
    //vector<Data<T>> product;
418
419
    /** Convert global parameter index to local layer index. */
420
    //vector<vector<tuple<size_t, size_t, size_t>>> Index2Layer( const vector<size_t> &I )
421
    //{
422
    //  size_t n_layer = net.size();
423
    //  vector<vector<tuple<size_t, size_t, size_t>>> ret( n_layer );
424
425
    //  for ( size_t i = 0; i < I.size(); i ++ )
426
    //  {
427
    //    auto upper = upper_bound( index_range.begin(), index_range.end(), I[ i ] );
428
    //    assert( upper != index_range.end() );
429
    //    size_t layer = distance( index_range.begin(), upper );
430
431
    //    /** Compute local index within the layer */
432
    //    size_t lid = I[ i ];
433
    //    if ( layer ) lid -= index_range[ layer - 1 ];
434
    //    Data<T> &w = net[ layer ]->GetParameters();
435
    //    size_t lidi = lid % w.row();
436
    //    size_t lidj = lid / w.row();
437
    //    ret[ layer ].push_back( make_tuple( i, lidi, lidj ) );
438
    //  }
439
440
    //  return ret;
441
    //};
442
443
    vector<tuple<size_t, size_t, size_t>> Index2Layer( const vector<size_t> &I )
444
    {
445
      vector<tuple<size_t, size_t, size_t>> ret( I.size() );
446
      for ( size_t i = 0; i < I.size(); i ++ )
447
      {
448
        auto upper = upper_bound( index_range.begin(), index_range.end(), I[ i ] );
449
        assert( upper != index_range.end() );
450
        size_t layer = distance( index_range.begin(), upper );
451
        /** Compute local index within the layer */
452
        size_t lid = I[ i ];
453
        if ( layer ) lid -= index_range[ layer - 1 ];
454
        Data<T> &w = net[ layer ]->GetParameters();
455
        size_t lidi = lid % w.row();
456
        size_t lidj = lid / w.row();
457
        ret[ i ] = make_tuple( layer, lidi, lidj );
458
      }
459
      return ret;
460
    };
461
462
    Data<T> Jacobian( const vector<size_t> &I )
463
    {
464
      size_t B = net.back()->BatchSize();
465
      size_t N = net.back()->NeuronSize();
466
      size_t nnz = net.back()->NumberNonZeros();
467
468
      //Data<T> J( N * B, I.size(), 0 );
469
      Data<T> J( nnz, I.size(), 0 );
470
471
      /** Iatl[ q ] contains all indices of layer q. */
472
      auto Iatl = Index2Layer( I );
473
      //printf( "Index2Layer\n" ); fflush( stdout );
474
475
      //#pragma omp parallel for
476
      for ( size_t b = 0; b < Iatl.size(); b ++ )
477
      {
478
        size_t l = get<0>( Iatl[ b ] );
479
        size_t i = get<1>( Iatl[ b ] );
480
        size_t j = get<2>( Iatl[ b ] );
481
482
        Data<T> Jbuff, tmp;
483
484
        if ( l == net.size() - 1 )
485
        {
486
          Jbuff = net[ l ]->PerturbedGradient( false, i, j );
487
        }
488
        else
489
        {
490
          Jbuff = net[ l + 0 ]->PerturbedGradient(  true, i, j );
491
          tmp   = net[ l + 1 ]->Gradient( Jbuff, i );
492
          Jbuff = tmp;
493
          for ( size_t layer = l + 2; layer < net.size(); layer ++ )
494
          {
495
            Data<T> tmp = net[ layer ]->Gradient( Jbuff );
496
            Jbuff = tmp;
497
          }
498
        }
499
500
        //for ( size_t layer = l + 1; layer < net.size(); layer ++ )
501
        //{
502
        //  Data<T> tmp = net[ layer ]->Gradient( Jbuff );
503
        //  Jbuff = tmp;
504
        //}
505
506
        assert( Jbuff.size() == N * B );
507
        size_t count = 0;
508
        for ( size_t q = 0; q < N * B; q ++ )
509
        {
510
          auto &lastx = net.back()->GetValues();
511
          if ( lastx[ q ] )
512
          {
513
            J( count, b ) = Jbuff[ q ];
514
            count ++;
515
          }
516
        }
517
        assert( count == nnz );
518
      }
519
520
      ///** Compute Jacobian perburbation for each layer. */
521
      //for ( size_t q = 0; q < Iatl.size(); q ++ )
522
      //{
523
      //  /** B-by-Iatl[ q ].size() */
524
      //  Data<T> JBq = net[ q ]->ForwardPerturbation( Iatl[ q ] );
525
      //  /** Compute the rest of the feed forward network. */
526
      //  for ( size_t j = 0; j < Iatl[ q ].size(); j ++ )
527
      //  {
528
      //    size_t lid  = get<0>( Iatl[ q ][ j ] );
529
      //    size_t lidi = get<1>( Iatl[ q ][ j ] );
530
      //    for ( size_t b = 0; b < B; b ++ ) JB( b, lid ) = JBq( b, j );
531
      //    for ( size_t i = 0; i < N; i ++ ) JN( i, lid ) = product[ q ]( i, lidi );
532
      //  }
533
      //}
534
535
      ///** Severin: compute J from JN and JB */
536
537
      return J;
538
    }
539
540
541
542
543
544
    /** JN is LastLayerNeuronSize-by-I.size(), JB is BatchSize-by-I.size() */
545
    //void Jacobian( const vector<size_t> &I, Data<T> &JN, Data<T> &JB )
546
    //{
547
    //  size_t B = net.back()->BatchSize();
548
    //  size_t N = net.back()->NeuronSize();
549
550
    //  JN.resize( N, I.size(), 0 );
551
    //  JB.resize( B, I.size(), 0 );
552
553
    //  /** */
554
    //  auto Iatl = Index2Layer( I );
555
556
    //  /** Compute Jacobian perburbation for each layer. */
557
    //  for ( size_t l = 0; l < Iatl.size(); l ++ )
558
    //  {
559
    //    Data<T> JBl = net[ l ]->ForwardPerturbation( Iatl[ l ] );
560
    //    /** Compute the rest of the feed forward network. */
561
    //    for ( size_t j = 0; j < Iatl[ l ].size(); j ++ )
562
    //    {
563
    //      size_t lid  = get<0>( Iatl[ l ][ j ] );
564
    //      size_t lidi = get<1>( Iatl[ l ][ j ] );
565
    //      for ( size_t b = 0; b < B; b ++ ) JB( b, lid ) = JBl( b, j );
566
    //      for ( size_t i = 0; i < N; i ++ ) JN( i, lid ) = product[ l ]( i, lidi );
567
    //    }
568
    //
569
    //  }
570
    //};
571
572
573
574
575
};
576
577
578
579
580
581
}; /** end namespace hmlp */
582
583
#endif /** define MLPGAUSSNEWTON_HPP */