MueLu Version of the Day
Loading...
Searching...
No Matches
MueLu_PerfModels_def.hpp
Go to the documentation of this file.
1// @HEADER
2//
3// ***********************************************************************
4//
5// MueLu: A package for multigrid based preconditioning
6// Copyright 2012 Sandia Corporation
7//
8// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9// the U.S. Government retains certain rights in this software.
10//
11// Redistribution and use in source and binary forms, with or without
12// modification, are permitted provided that the following conditions are
13// met:
14//
15// 1. Redistributions of source code must retain the above copyright
16// notice, this list of conditions and the following disclaimer.
17//
18// 2. Redistributions in binary form must reproduce the above copyright
19// notice, this list of conditions and the following disclaimer in the
20// documentation and/or other materials provided with the distribution.
21//
22// 3. Neither the name of the Corporation nor the names of the
23// contributors may be used to endorse or promote products derived from
24// this software without specific prior written permission.
25//
26// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37//
38// Questions? Contact
39// Jonathan Hu (jhu@sandia.gov)
40// Andrey Prokopenko (aprokop@sandia.gov)
41// Ray Tuminaro (rstumin@sandia.gov)
42//
43// ***********************************************************************
44//
45// @HEADER
46#include <cstdio>
47#include <cmath>
48#include <numeric>
49#include <utility>
50#include <chrono>
51#include <iomanip>
52#include <Teuchos_ScalarTraits.hpp>
53#include <Kokkos_ArithTraits.hpp>
55
56
57#ifdef HAVE_MPI
58#include <mpi.h>
59#endif
60
61namespace MueLu {
62
63 namespace PerfDetails {
64 template<class Scalar,class Node>
65 double stream_vector_add(int KERNEL_REPEATS, int VECTOR_SIZE) {
66 // PerfDetails' STREAM routines need to be instantiatiated on impl_scalar_type, not Scalar
67 using impl_scalar_type = typename Kokkos::Details::ArithTraits<Scalar>::val_type;
68
69 using exec_space = typename Node::execution_space;
70 using memory_space = typename Node::memory_space;
71 using range_policy = Kokkos::RangePolicy<exec_space>;
72
73 Kokkos::View<impl_scalar_type*,memory_space> a("a", VECTOR_SIZE);
74 Kokkos::View<impl_scalar_type*,memory_space> b("b", VECTOR_SIZE);
75 Kokkos::View<impl_scalar_type*,memory_space> c("c", VECTOR_SIZE);
76 double total_test_time = 0.0;
77
78 impl_scalar_type ONE = Teuchos::ScalarTraits<impl_scalar_type>::one();
79
80 Kokkos::parallel_for("stream/fill",range_policy(0,VECTOR_SIZE), KOKKOS_LAMBDA (const size_t i) {
81 a(i) = ONE * (double)i;
82 b(i) = a(i);
83 });
84 exec_space().fence();
85
86 using clock = std::chrono::high_resolution_clock;
87
88 clock::time_point start, stop;
89
90 for(int i = 0; i < KERNEL_REPEATS; i++) {
91 start = clock::now();
92 Kokkos::parallel_for("stream/add",range_policy(0,VECTOR_SIZE), KOKKOS_LAMBDA (const size_t j) { //Vector Addition
93 c(j) = a(j) + b(j);
94 });
95
96 exec_space().fence();
97 stop = clock::now();
98 double my_test_time = std::chrono::duration<double>(stop - start).count();
99 total_test_time += my_test_time;
100 }
101
102 return total_test_time / KERNEL_REPEATS;
103 }
104
105 template<class Scalar,class Node>
106 double stream_vector_copy(int KERNEL_REPEATS, int VECTOR_SIZE) {
107 // PerfDetails' STREAM routines need to be instantiatiated on impl_scalar_type, not Scalar
108 using impl_scalar_type = typename Kokkos::Details::ArithTraits<Scalar>::val_type;
109
110 using exec_space = typename Node::execution_space;
111 using memory_space = typename Node::memory_space;
112 using range_policy = Kokkos::RangePolicy<exec_space>;
113
114 Kokkos::View<impl_scalar_type*,memory_space> a("a", VECTOR_SIZE);
115 Kokkos::View<impl_scalar_type*,memory_space> b("b", VECTOR_SIZE);
116 double total_test_time = 0.0;
117
118 impl_scalar_type ONE = Teuchos::ScalarTraits<impl_scalar_type>::one();
119
120 Kokkos::parallel_for("stream/fill",range_policy(0,VECTOR_SIZE), KOKKOS_LAMBDA (const size_t i) {
121 a(i) = ONE;
122 });
123 exec_space().fence();
124
125 using clock = std::chrono::high_resolution_clock;
126 clock::time_point start, stop;
127
128 for(int i = 0; i < KERNEL_REPEATS; i++) {
129 start = clock::now();
130 Kokkos::parallel_for("stream/copy",range_policy(0,VECTOR_SIZE), KOKKOS_LAMBDA (const size_t j) { //Vector Addition
131 b(j) = a(j);
132 });
133
134 exec_space().fence();
135 stop = clock::now();
136 double my_test_time = std::chrono::duration<double>(stop - start).count();
137 total_test_time += my_test_time;
138 }
139
140 return total_test_time / KERNEL_REPEATS;
141 }
142
143
144
145 double table_lookup(const std::vector<int> & x, const std::vector<double> & y, int value) {
146 // If there's no table, nan
147 if(x.size() == 0) return Teuchos::ScalarTraits<double>::nan();
148
149 // NOTE: This should probably be a binary search, but this isn't performance sensitive, so we'll go simple
150 int N = (int) x.size();
151 int hi = 0;
152 for( ; hi < N; hi++) {
153 if (x[hi] > value)
154 break;
155 }
156
157 if(hi == 0) {
158 // Lower end (return the min time)
159 //printf("Lower end: %d < %d\n",value,x[0]);
160 return y[0];
161 }
162 else if (hi == N) {
163 // Higher end (extrapolate from the last two points)
164 //printf("Upper end: %d > %d\n",value,x[N-1]);
165 hi = N-1;
166 int run = x[hi] - x[hi-1];
167 double rise = y[hi] - y[hi-1];
168 double slope = rise / run;
169 int diff = value - x[hi-1];
170
171 return y[hi-1] + slope * diff;
172 }
173 else {
174 // Interpolate
175 //printf("Middle: %d < %d < %d\n",x[hi-1],value,x[hi]);
176 int run = x[hi] - x[hi-1];
177 double rise = y[hi] - y[hi-1];
178 double slope = rise / run;
179 int diff = value - x[hi-1];
180
181 return y[hi-1] + slope * diff;
182 }
183 }
184
185 // Report bandwidth in GB / sec
186 const double GB = 1024.0 * 1024.0 * 1024.0;
187 double convert_time_to_bandwidth_gbs(double time, int num_calls, double memory_per_call_bytes) {
188 double time_per_call = time / num_calls;
189 return memory_per_call_bytes / GB / time_per_call;
190 }
191
192
193 template <class exec_space, class memory_space>
194 void pingpong_basic(int KERNEL_REPEATS, int MAX_SIZE,const Teuchos::Comm<int> &comm, std::vector<int> & sizes, std::vector<double> & times) {
195 int rank = comm.getRank();
196 int nproc = comm.getSize();
197
198 if(nproc < 2) return;
199
200#ifdef HAVE_MPI
201 using range_policy = Kokkos::RangePolicy<exec_space>;
202 const int buff_size = (int) pow(2,MAX_SIZE);
203
204 sizes.resize(MAX_SIZE+1);
205 times.resize(MAX_SIZE+1);
206
207 // Allocate memory for the buffers (and fill send)
208 Kokkos::View<char*,memory_space> r_buf("recv",buff_size), s_buf("send",buff_size);
209 Kokkos::deep_copy(s_buf,1);
210
211 //Send and recieve.
212 // NOTE: Do consectutive pair buddies here for simplicity. We should be smart later
213 int odd = rank % 2;
214 int buddy = odd ? rank - 1 : rank + 1;
215
216 for(int i = 0; i < MAX_SIZE + 1 ;i ++) {
217 int msg_size = (int) pow(2,i);
218 comm.barrier();
219
220 double t0 = MPI_Wtime();
221 for(int j = 0; j < KERNEL_REPEATS; j++) {
222 if (buddy < nproc) {
223 if (odd) {
224 comm.send(msg_size, (char*)s_buf.data(), buddy);
225 comm.receive(buddy, msg_size, (char*)r_buf.data());
226 }
227 else {
228 comm.receive(buddy, msg_size,(char*)r_buf.data());
229 comm.send(msg_size, (char*)s_buf.data(), buddy);
230 }
231 }
232 }
233
234 double time_per_call = (MPI_Wtime() - t0) / (2.0 * KERNEL_REPEATS);
235 sizes[i] = msg_size;
236 times[i] = time_per_call;
237 }
238#endif
239 }
240
241
242
243 }// end namespace PerfDetails
244
245
246 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
248
249
250 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
251 void
253
254 // We need launch/waits latency estimates for corrected stream
255 launch_latency_make_table(KERNEL_REPEATS);
256 double latency = launch_latency_lookup();
257
258 if(LOG_MAX_SIZE < 2)
259 LOG_MAX_SIZE=20;
260
261 stream_sizes_.resize(LOG_MAX_SIZE+1);
262 stream_copy_times_.resize(LOG_MAX_SIZE+1);
263 stream_add_times_.resize(LOG_MAX_SIZE+1);
264 latency_corrected_stream_copy_times_.resize(LOG_MAX_SIZE+1);
265 latency_corrected_stream_add_times_.resize(LOG_MAX_SIZE+1);
266
267 for(int i=0; i<LOG_MAX_SIZE+1; i++) {
268 int size = (int) pow(2,i);
269 double c_time = PerfDetails::stream_vector_copy<Scalar,Node>(KERNEL_REPEATS,size);
270 double a_time = PerfDetails::stream_vector_add<Scalar,Node>(KERNEL_REPEATS,size);
271
272 stream_sizes_[i] = size;
273
274 // Correct for the difference in memory transactions per element
275 stream_copy_times_[i] = c_time / 2.0;
276 stream_add_times_[i] = a_time / 3.0;
277
278 // Correct for launch latency too. We'll note that sometimes the latency estimate
279 // is higher than the actual copy/add time estimate. If so, we don't correct
280 latency_corrected_stream_copy_times_[i] = (c_time - latency <= 0.0) ? c_time / 2.0 : ( (c_time-latency)/2.0 );
281 latency_corrected_stream_add_times_[i] = (a_time - latency <= 0.0) ? a_time / 3.0 : ( (a_time-latency)/3.0 );
282
283
284 }
285 }
286
287 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
288 double
290 return PerfDetails::table_lookup(stream_sizes_,stream_copy_times_,SIZE_IN_BYTES/sizeof(Scalar));
291 }
292
293 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
294 double
296 return PerfDetails::table_lookup(stream_sizes_,stream_add_times_,SIZE_IN_BYTES/sizeof(Scalar));
297 }
298
299 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
300 double
302 return std::min(stream_vector_copy_lookup(SIZE_IN_BYTES),stream_vector_add_lookup(SIZE_IN_BYTES));
303 }
304
305
306 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
307 double
309 return PerfDetails::table_lookup(stream_sizes_,latency_corrected_stream_copy_times_,SIZE_IN_BYTES/sizeof(Scalar));
310 }
311
312 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
313 double
315 return PerfDetails::table_lookup(stream_sizes_,latency_corrected_stream_add_times_,SIZE_IN_BYTES/sizeof(Scalar));
316 }
317
318 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
319 double
321 return std::min(latency_corrected_stream_vector_copy_lookup(SIZE_IN_BYTES),latency_corrected_stream_vector_add_lookup(SIZE_IN_BYTES));
322 }
323
324
325 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
326 void
328 print_stream_vector_table_impl(out,false);
329 }
330
331 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
332 void
334 print_stream_vector_table_impl(out,true);
335 }
336
337
338 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
339 void
341 using namespace std;
342 std::ios old_format(NULL);
343 old_format.copyfmt(out);
344
345 out << setw(20) << "Length in Scalars" << setw(1) << " "
346 << setw(20) << "COPY (us)" << setw(1) << " "
347 << setw(20) << "ADD (us)" << setw(1) << " "
348 << setw(20) << "COPY (GB/s)" << setw(1) << " "
349 << setw(20) << "ADD (GB/s)" << std::endl;
350
351 out << setw(20) << "-----------------" << setw(1) << " "
352 << setw(20) << "---------" << setw(1) << " "
353 << setw(20) << "--------" << setw(1) << " "
354 << setw(20) << "-----------" << setw(1) << " "
355 << setw(20) << "----------" << std::endl;
356
357
358 for(int i=0; i<(int)stream_sizes_.size(); i++) {
359 int size = stream_sizes_[i];
360 double c_time = use_latency_correction ? latency_corrected_stream_copy_times_[i] : stream_copy_times_[i];
361 double a_time = use_latency_correction ? latency_corrected_stream_add_times_[i] : stream_add_times_[i];
362 // We've already corrected for the transactions per element difference
363 double c_bw = PerfDetails::convert_time_to_bandwidth_gbs(c_time,1,size*sizeof(Scalar));
364 double a_bw = PerfDetails::convert_time_to_bandwidth_gbs(a_time,1,size*sizeof(Scalar));
365
366
367 out << setw(20) << size << setw(1) << " "
368 << setw(20) << fixed << setprecision(4) << (c_time*1e6) << setw(1) << " "
369 << setw(20) << fixed << setprecision(4) << (a_time*1e6) << setw(1) << " "
370 << setw(20) << fixed << setprecision(4) << c_bw << setw(1) << " "
371 << setw(20) << fixed << setprecision(4) << a_bw << std::endl;
372 }
373
374 out.copyfmt(old_format);
375 }
376
377
378
379
380 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
381 void
382 PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Teuchos::Comm<int> > &comm) {
383
384 PerfDetails::pingpong_basic<Kokkos::HostSpace::execution_space,Kokkos::HostSpace::memory_space>(KERNEL_REPEATS,LOG_MAX_SIZE,*comm,pingpong_sizes_,pingpong_host_times_);
385
386 PerfDetails::pingpong_basic<typename Node::execution_space,typename Node::memory_space>(KERNEL_REPEATS,LOG_MAX_SIZE,*comm,pingpong_sizes_,pingpong_device_times_);
387
388 }
389
390 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
391 double
393 return PerfDetails::table_lookup(pingpong_sizes_,pingpong_host_times_,SIZE_IN_BYTES);
394 }
395
396 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
397 double
399 return PerfDetails::table_lookup(pingpong_sizes_,pingpong_device_times_,SIZE_IN_BYTES);
400 }
401
402
403 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
404 void
406 if(pingpong_sizes_.size() == 0) return;
407
408 using namespace std;
409 std::ios old_format(NULL);
410 old_format.copyfmt(out);
411
412 out << setw(20) << "Message Size" << setw(1) << " "
413 << setw(20) << "Host (us)" << setw(1) << " "
414 << setw(20) << "Device (us)" << std::endl;
415
416 out << setw(20) << "------------" << setw(1) << " "
417 << setw(20) << "---------" << setw(1) << " "
418 << setw(20) << "-----------" << std::endl;
419
420
421 for(int i=0; i<(int)pingpong_sizes_.size(); i++) {
422 int size = pingpong_sizes_[i];
423 double h_time = pingpong_host_times_[i];
424 double d_time = pingpong_device_times_[i];
425
426
427 out << setw(20) << size << setw(1) << " "
428 << setw(20) << fixed << setprecision(4) << (h_time*1e6) << setw(1) << " "
429 << setw(20) << fixed << setprecision(4) << (d_time*1e6) << setw(1) << std::endl;
430 }
431
432 out.copyfmt(old_format);
433 }
434
435 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
436 void
438 using exec_space = typename Node::execution_space;
439 using range_policy = Kokkos::RangePolicy<exec_space>;
440 using clock = std::chrono::high_resolution_clock;
441
442 double total_test_time = 0;
443 clock::time_point start, stop;
444 for(int i = 0; i < KERNEL_REPEATS; i++) {
445 start = clock::now();
446 Kokkos::parallel_for("empty kernel",range_policy(0,1), KOKKOS_LAMBDA (const size_t j) {
447 ;
448 });
449 exec_space().fence();
450 stop = clock::now();
451 double my_test_time = std::chrono::duration<double>(stop - start).count();
452 total_test_time += my_test_time;
453 }
454
455 launch_and_wait_latency_ = total_test_time / KERNEL_REPEATS;
456 }
457
458 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
459 double
461 return launch_and_wait_latency_;
462 }
463
464
465 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
466 void
468 using namespace std;
469 std::ios old_format(NULL);
470 old_format.copyfmt(out);
471
472 out << setw(20) << "Launch+Wait Latency (us)" << setw(1) << " "
473 << setw(20) << fixed << setprecision(4) << (launch_and_wait_latency_*1e6) << std::endl;
474
475 out.copyfmt(old_format);
476 }
477
478
479} //namespace MueLu
MueLu::DefaultScalar Scalar
void print_pingpong_table(std::ostream &out)
void print_latency_corrected_stream_vector_table(std::ostream &out)
void print_stream_vector_table(std::ostream &out)
void print_stream_vector_table_impl(std::ostream &out, bool use_latency_correction)
double stream_vector_copy_lookup(int SIZE_IN_BYTES)
void stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE=20)
double latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES)
void pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP< const Teuchos::Comm< int > > &comm)
double latency_corrected_stream_vector_copy_lookup(int SIZE_IN_BYTES)
double pingpong_device_lookup(int SIZE_IN_BYTES)
double pingpong_host_lookup(int SIZE_IN_BYTES)
double latency_corrected_stream_vector_add_lookup(int SIZE_IN_BYTES)
double stream_vector_add_lookup(int SIZE_IN_BYTES)
double stream_vector_lookup(int SIZE_IN_BYTES)
void launch_latency_make_table(int KERNEL_REPEATS)
void print_launch_latency_table(std::ostream &out)
double table_lookup(const std::vector< int > &x, const std::vector< double > &y, int value)
double stream_vector_add(int KERNEL_REPEATS, int VECTOR_SIZE)
void pingpong_basic(int KERNEL_REPEATS, int MAX_SIZE, const Teuchos::Comm< int > &comm, std::vector< int > &sizes, std::vector< double > &times)
double convert_time_to_bandwidth_gbs(double time, int num_calls, double memory_per_call_bytes)
double stream_vector_copy(int KERNEL_REPEATS, int VECTOR_SIZE)
Namespace for MueLu classes and methods.