40#ifndef TPETRA_BLOCKCRSMATRIX_DEF_HPP
41#define TPETRA_BLOCKCRSMATRIX_DEF_HPP
49#include "Tpetra_BlockMultiVector.hpp"
52#include "Teuchos_TimeMonitor.hpp"
53#ifdef HAVE_TPETRA_DEBUG
69#if defined(__CUDACC__)
71# if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
72# undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
75#elif defined(__GNUC__)
77# if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
78# undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
84# if ! defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
85# define TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA 1
95 struct BlockCrsRowStruct {
96 T totalNumEntries, totalNumBytes, maxRowLength;
97 KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct() =
default;
98 KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct(
const BlockCrsRowStruct &b) =
default;
99 KOKKOS_INLINE_FUNCTION BlockCrsRowStruct(
const T& numEnt,
const T& numBytes,
const T& rowLength)
100 : totalNumEntries(numEnt), totalNumBytes(numBytes), maxRowLength(rowLength) {}
105 KOKKOS_INLINE_FUNCTION
106 void operator+=(BlockCrsRowStruct<T> &a,
const BlockCrsRowStruct<T> &b) {
107 a.totalNumEntries += b.totalNumEntries;
108 a.totalNumBytes += b.totalNumBytes;
109 a.maxRowLength = a.maxRowLength > b.maxRowLength ? a.maxRowLength : b.maxRowLength;
112 template<
typename T,
typename ExecSpace>
113 struct BlockCrsReducer {
114 typedef BlockCrsReducer reducer;
115 typedef T value_type;
116 typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
119 KOKKOS_INLINE_FUNCTION
120 BlockCrsReducer(value_type &val) : value(&val) {}
122 KOKKOS_INLINE_FUNCTION
void join(value_type &dst, value_type &src)
const { dst += src; }
123 KOKKOS_INLINE_FUNCTION
void join(value_type &dst,
const value_type &src)
const { dst += src; }
124 KOKKOS_INLINE_FUNCTION
void init(value_type &val)
const { val = value_type(); }
125 KOKKOS_INLINE_FUNCTION value_type& reference() {
return *value; }
126 KOKKOS_INLINE_FUNCTION result_view_type view()
const {
return result_view_type(value); }
129 template<
class AlphaCoeffType,
131 class MatrixValuesType,
136 class BcrsApplyNoTransFunctor {
138 static_assert (Kokkos::is_view<MatrixValuesType>::value,
139 "MatrixValuesType must be a Kokkos::View.");
140 static_assert (Kokkos::is_view<OutVecType>::value,
141 "OutVecType must be a Kokkos::View.");
142 static_assert (Kokkos::is_view<InVecType>::value,
143 "InVecType must be a Kokkos::View.");
144 static_assert (std::is_same<MatrixValuesType,
145 typename MatrixValuesType::const_type>::value,
146 "MatrixValuesType must be a const Kokkos::View.");
147 static_assert (std::is_same<
typename OutVecType::value_type,
148 typename OutVecType::non_const_value_type>::value,
149 "OutVecType must be a nonconst Kokkos::View.");
150 static_assert (std::is_same<
typename InVecType::value_type,
151 typename InVecType::const_value_type>::value,
152 "InVecType must be a const Kokkos::View.");
153 static_assert (
static_cast<int> (MatrixValuesType::rank) == 1,
154 "MatrixValuesType must be a rank-1 Kokkos::View.");
155 static_assert (
static_cast<int> (InVecType::rank) == 1,
156 "InVecType must be a rank-1 Kokkos::View.");
157 static_assert (
static_cast<int> (OutVecType::rank) == 1,
158 "OutVecType must be a rank-1 Kokkos::View.");
159 typedef typename MatrixValuesType::non_const_value_type scalar_type;
162 typedef typename GraphType::device_type device_type;
165 typedef typename std::remove_const<typename GraphType::data_type>::type
174 void setX (
const InVecType& X) { X_ = X; }
182 void setY (
const OutVecType& Y) { Y_ = Y; }
184 typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_coeff_type;
185 typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_coeff_type;
188 BcrsApplyNoTransFunctor (
const alpha_coeff_type& alpha,
189 const GraphType& graph,
190 const MatrixValuesType& val,
191 const local_ordinal_type blockSize,
193 const beta_coeff_type& beta,
194 const OutVecType& Y) :
196 ptr_ (graph.row_map),
197 ind_ (graph.entries),
199 blockSize_ (blockSize),
206 KOKKOS_INLINE_FUNCTION
void
207 operator () (
const typename Kokkos::TeamPolicy<typename device_type::execution_space>::member_type & member)
const
209 Kokkos::abort(
"Tpetra::BcrsApplyNoTransFunctor:: this should not be called");
213 KOKKOS_INLINE_FUNCTION
void
214 operator () (
const local_ordinal_type& lclRow)
const
216 using ::Tpetra::FILL;
217 using ::Tpetra::SCAL;
218 using ::Tpetra::GEMV;
219 using Kokkos::Details::ArithTraits;
225 using Kokkos::parallel_for;
226 using Kokkos::subview;
227 typedef typename decltype (ptr_)::non_const_value_type offset_type;
228 typedef Kokkos::View<
typename MatrixValuesType::const_value_type**,
231 Kokkos::MemoryTraits<Kokkos::Unmanaged> >
234 const offset_type Y_ptBeg = lclRow * blockSize_;
235 const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
236 auto Y_cur = subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));
240 if (beta_ == ArithTraits<beta_coeff_type>::zero ()) {
241 FILL (Y_cur, ArithTraits<beta_coeff_type>::zero ());
243 else if (beta_ != ArithTraits<beta_coeff_type>::one ()) {
247 if (alpha_ != ArithTraits<alpha_coeff_type>::zero ()) {
248 const offset_type blkBeg = ptr_[lclRow];
249 const offset_type blkEnd = ptr_[lclRow+1];
251 const offset_type bs2 = blockSize_ * blockSize_;
252 for (offset_type absBlkOff = blkBeg; absBlkOff < blkEnd;
254 little_block_type A_cur (val_.data () + absBlkOff * bs2,
255 blockSize_, blockSize_);
256 const offset_type X_blkCol = ind_[absBlkOff];
257 const offset_type X_ptBeg = X_blkCol * blockSize_;
258 const offset_type X_ptEnd = X_ptBeg + blockSize_;
259 auto X_cur = subview (X_, ::Kokkos::make_pair (X_ptBeg, X_ptEnd));
261 GEMV (alpha_, A_cur, X_cur, Y_cur);
267 alpha_coeff_type alpha_;
268 typename GraphType::row_map_type::const_type ptr_;
269 typename GraphType::entries_type::const_type ind_;
270 MatrixValuesType val_;
273 beta_coeff_type beta_;
277 template<
class AlphaCoeffType,
279 class MatrixValuesType,
283 class BcrsApplyNoTransFunctor<AlphaCoeffType,
291 static_assert (Kokkos::is_view<MatrixValuesType>::value,
292 "MatrixValuesType must be a Kokkos::View.");
293 static_assert (Kokkos::is_view<OutVecType>::value,
294 "OutVecType must be a Kokkos::View.");
295 static_assert (Kokkos::is_view<InVecType>::value,
296 "InVecType must be a Kokkos::View.");
297 static_assert (std::is_same<MatrixValuesType,
298 typename MatrixValuesType::const_type>::value,
299 "MatrixValuesType must be a const Kokkos::View.");
300 static_assert (std::is_same<
typename OutVecType::value_type,
301 typename OutVecType::non_const_value_type>::value,
302 "OutVecType must be a nonconst Kokkos::View.");
303 static_assert (std::is_same<
typename InVecType::value_type,
304 typename InVecType::const_value_type>::value,
305 "InVecType must be a const Kokkos::View.");
306 static_assert (
static_cast<int> (MatrixValuesType::rank) == 1,
307 "MatrixValuesType must be a rank-1 Kokkos::View.");
308 static_assert (
static_cast<int> (InVecType::rank) == 1,
309 "InVecType must be a rank-1 Kokkos::View.");
310 static_assert (
static_cast<int> (OutVecType::rank) == 1,
311 "OutVecType must be a rank-1 Kokkos::View.");
312 typedef typename MatrixValuesType::non_const_value_type scalar_type;
315 typedef typename GraphType::device_type device_type;
318 static constexpr bool runOnHost = !std::is_same_v<typename device_type::execution_space, Kokkos::DefaultExecutionSpace> || std::is_same_v<Kokkos::DefaultExecutionSpace, Kokkos::DefaultHostExecutionSpace>;
323 typedef typename std::remove_const<typename GraphType::data_type>::type
332 void setX (
const InVecType& X) { X_ = X; }
340 void setY (
const OutVecType& Y) { Y_ = Y; }
342 typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_coeff_type;
343 typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_coeff_type;
346 BcrsApplyNoTransFunctor (
const alpha_coeff_type& alpha,
347 const GraphType& graph,
348 const MatrixValuesType& val,
349 const local_ordinal_type blockSize,
351 const beta_coeff_type& beta,
352 const OutVecType& Y) :
354 ptr_ (graph.row_map),
355 ind_ (graph.entries),
357 blockSize_ (blockSize),
364 KOKKOS_INLINE_FUNCTION
void
365 operator () (
const local_ordinal_type& lclRow)
const
367 Kokkos::abort(
"Tpetra::BcrsApplyNoTransFunctor:: this should not be called");
371 KOKKOS_INLINE_FUNCTION
void
372 operator () (
const typename Kokkos::TeamPolicy<typename device_type::execution_space>::member_type & member)
const
377 using Kokkos::Details::ArithTraits;
383 using Kokkos::parallel_for;
384 using Kokkos::subview;
385 typedef typename decltype (ptr_)::non_const_value_type offset_type;
386 typedef Kokkos::View<
typename MatrixValuesType::const_value_type**,
389 Kokkos::MemoryTraits<Kokkos::Unmanaged> >
392 const offset_type Y_ptBeg = lclRow * blockSize_;
393 const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
394 auto Y_cur = subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));
398 if (beta_ == ArithTraits<beta_coeff_type>::zero ()) {
399 Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blockSize_),
400 [&](
const local_ordinal_type &i) {
401 Y_cur(i) = ArithTraits<beta_coeff_type>::zero ();
404 else if (beta_ != ArithTraits<beta_coeff_type>::one ()) {
405 Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blockSize_),
406 [&](
const local_ordinal_type &i) {
410 member.team_barrier();
412 if (alpha_ != ArithTraits<alpha_coeff_type>::zero ()) {
413 const offset_type blkBeg = ptr_[lclRow];
414 const offset_type blkEnd = ptr_[lclRow+1];
416 const offset_type bs2 = blockSize_ * blockSize_;
417 little_block_type A_cur (val_.data (), blockSize_, blockSize_);
418 auto X_cur = subview (X_, ::Kokkos::make_pair (0, blockSize_));
420 (Kokkos::TeamThreadRange(member, blkBeg, blkEnd),
421 [&](
const local_ordinal_type &absBlkOff) {
422 A_cur.assign_data(val_.data () + absBlkOff * bs2);
423 const offset_type X_blkCol = ind_[absBlkOff];
424 const offset_type X_ptBeg = X_blkCol * blockSize_;
425 X_cur.assign_data(&X_(X_ptBeg));
428 (Kokkos::ThreadVectorRange(member, blockSize_),
429 [&](
const local_ordinal_type &k0) {
431 for (local_ordinal_type k1=0;k1<blockSize_;++k1)
432 val += A_cur(k0,k1)*X_cur(k1);
433 if constexpr(runOnHost) {
435 Y_cur(k0) += alpha_*val;
442 Kokkos::atomic_add(&Y_cur(k0), alpha_*val);
450 alpha_coeff_type alpha_;
451 typename GraphType::row_map_type::const_type ptr_;
452 typename GraphType::entries_type::const_type ind_;
453 MatrixValuesType val_;
456 beta_coeff_type beta_;
461 template<
class AlphaCoeffType,
463 class MatrixValuesType,
464 class InMultiVecType,
466 class OutMultiVecType>
468 bcrsLocalApplyNoTrans (
const AlphaCoeffType& alpha,
469 const GraphType& graph,
470 const MatrixValuesType& val,
471 const typename std::remove_const<typename GraphType::data_type>::type blockSize,
472 const InMultiVecType& X,
473 const BetaCoeffType& beta,
474 const OutMultiVecType& Y
477 static_assert (Kokkos::is_view<MatrixValuesType>::value,
478 "MatrixValuesType must be a Kokkos::View.");
479 static_assert (Kokkos::is_view<OutMultiVecType>::value,
480 "OutMultiVecType must be a Kokkos::View.");
481 static_assert (Kokkos::is_view<InMultiVecType>::value,
482 "InMultiVecType must be a Kokkos::View.");
483 static_assert (
static_cast<int> (MatrixValuesType::rank) == 1,
484 "MatrixValuesType must be a rank-1 Kokkos::View.");
485 static_assert (
static_cast<int> (OutMultiVecType::rank) == 2,
486 "OutMultiVecType must be a rank-2 Kokkos::View.");
487 static_assert (
static_cast<int> (InMultiVecType::rank) == 2,
488 "InMultiVecType must be a rank-2 Kokkos::View.");
490 typedef typename MatrixValuesType::device_type::execution_space execution_space;
491 typedef typename MatrixValuesType::device_type::memory_space memory_space;
492 typedef typename MatrixValuesType::const_type matrix_values_type;
493 typedef typename OutMultiVecType::non_const_type out_multivec_type;
494 typedef typename InMultiVecType::const_type in_multivec_type;
495 typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_type;
496 typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_type;
497 typedef typename std::remove_const<typename GraphType::data_type>::type LO;
499 constexpr bool is_builtin_type_enabled = std::is_arithmetic<typename InMultiVecType::non_const_value_type>::value;
500 constexpr bool is_host_memory_space = std::is_same<memory_space,Kokkos::HostSpace>::value;
501 constexpr bool use_team_policy = (is_builtin_type_enabled && !is_host_memory_space);
503 const LO numLocalMeshRows = graph.row_map.extent (0) == 0 ?
504 static_cast<LO
> (0) :
505 static_cast<LO
> (graph.row_map.extent (0) - 1);
506 const LO numVecs = Y.extent (1);
507 if (numLocalMeshRows == 0 || numVecs == 0) {
514 in_multivec_type X_in = X;
515 out_multivec_type Y_out = Y;
520 typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0)) in_vec_type;
521 typedef decltype (Kokkos::subview (Y_out, Kokkos::ALL (), 0)) out_vec_type;
522 typedef BcrsApplyNoTransFunctor<alpha_type, GraphType,
523 matrix_values_type, in_vec_type, beta_type, out_vec_type,
524 use_team_policy> functor_type;
526 auto X_0 = Kokkos::subview (X_in, Kokkos::ALL (), 0);
527 auto Y_0 = Kokkos::subview (Y_out, Kokkos::ALL (), 0);
530 if (use_team_policy) {
531 functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0);
533 typedef Kokkos::TeamPolicy<execution_space> policy_type;
534 policy_type policy(1,1);
535#if defined(KOKKOS_ENABLE_CUDA)
536 constexpr bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
538 constexpr bool is_cuda =
false;
541 LO team_size = 8, vector_size = 1;
542 if (blockSize <= 5) vector_size = 4;
543 else if (blockSize <= 9) vector_size = 8;
544 else if (blockSize <= 12) vector_size = 12;
545 else if (blockSize <= 20) vector_size = 20;
546 else vector_size = 20;
547 policy = policy_type(numLocalMeshRows, team_size, vector_size);
549 policy = policy_type(numLocalMeshRows, 1, 1);
551 Kokkos::parallel_for (policy, functor);
554 for (LO j = 1; j < numVecs; ++j) {
555 auto X_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
556 auto Y_j = Kokkos::subview (Y_out, Kokkos::ALL (), j);
559 Kokkos::parallel_for (policy, functor);
562 functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0);
563 Kokkos::RangePolicy<execution_space,LO> policy(0, numLocalMeshRows);
564 Kokkos::parallel_for (policy, functor);
565 for (LO j = 1; j < numVecs; ++j) {
566 auto X_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
567 auto Y_j = Kokkos::subview (Y_out, Kokkos::ALL (), j);
570 Kokkos::parallel_for (policy, functor);
580template<
class Scalar,
class LO,
class GO,
class Node>
581class GetLocalDiagCopy {
583 typedef typename Node::device_type device_type;
584 typedef size_t diag_offset_type;
585 typedef Kokkos::View<
const size_t*, device_type,
586 Kokkos::MemoryUnmanaged> diag_offsets_type;
587 typedef typename ::Tpetra::CrsGraph<LO, GO, Node> global_graph_type;
589 typedef typename local_graph_device_type::row_map_type row_offsets_type;
590 typedef typename ::Tpetra::BlockMultiVector<Scalar, LO, GO, Node>::impl_scalar_type IST;
591 typedef Kokkos::View<IST***, device_type, Kokkos::MemoryUnmanaged> diag_type;
592 typedef Kokkos::View<const IST*, device_type, Kokkos::MemoryUnmanaged> values_type;
595 GetLocalDiagCopy (
const diag_type& diag,
596 const values_type& val,
597 const diag_offsets_type& diagOffsets,
598 const row_offsets_type& ptr,
599 const LO blockSize) :
601 diagOffsets_ (diagOffsets),
603 blockSize_ (blockSize),
604 offsetPerBlock_ (blockSize_*blockSize_),
608 KOKKOS_INLINE_FUNCTION
void
609 operator() (
const LO& lclRowInd)
const
614 const size_t absOffset = ptr_[lclRowInd];
617 const size_t relOffset = diagOffsets_[lclRowInd];
620 const size_t pointOffset = (absOffset+relOffset)*offsetPerBlock_;
625 device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
626 const_little_block_type;
627 const_little_block_type D_in (val_.data () + pointOffset,
628 blockSize_, blockSize_);
629 auto D_out = Kokkos::subview (diag_, lclRowInd, ALL (), ALL ());
635 diag_offsets_type diagOffsets_;
636 row_offsets_type ptr_;
643 template<
class Scalar,
class LO,
class GO,
class Node>
645 BlockCrsMatrix<Scalar, LO, GO, Node>::
646 markLocalErrorAndGetStream ()
648 * (this->localError_) =
true;
649 if ((*errs_).is_null ()) {
650 *errs_ = Teuchos::rcp (
new std::ostringstream ());
655 template<
class Scalar,
class LO,
class GO,
class Node>
659 graph_ (Teuchos::rcp (new
map_type ()), 0),
660 blockSize_ (static_cast<LO> (0)),
661 X_colMap_ (new Teuchos::RCP<
BMV> ()),
662 Y_rowMap_ (new Teuchos::RCP<
BMV> ()),
663 pointImporter_ (new Teuchos::RCP<typename
crs_graph_type::import_type> ()),
665 localError_ (new bool (false)),
666 errs_ (new Teuchos::RCP<std::ostringstream> ())
670 template<
class Scalar,
class LO,
class GO,
class Node>
673 const LO blockSize) :
676 rowMeshMap_ (* (graph.getRowMap ())),
677 blockSize_ (blockSize),
678 X_colMap_ (new Teuchos::RCP<
BMV> ()),
679 Y_rowMap_ (new Teuchos::RCP<
BMV> ()),
680 pointImporter_ (new Teuchos::RCP<typename
crs_graph_type::import_type> ()),
681 offsetPerBlock_ (blockSize * blockSize),
682 localError_ (new bool (false)),
683 errs_ (new Teuchos::RCP<std::ostringstream> ())
687 TEUCHOS_TEST_FOR_EXCEPTION(
688 ! graph_.
isSorted (), std::invalid_argument,
"Tpetra::"
689 "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
690 "rows (isSorted() is false). This class assumes sorted rows.");
692 graphRCP_ = Teuchos::rcpFromRef(graph_);
698 const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
699 TEUCHOS_TEST_FOR_EXCEPTION(
700 blockSizeIsNonpositive, std::invalid_argument,
"Tpetra::"
701 "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
702 " <= 0. The block size must be positive.");
710 const auto colPointMap = Teuchos::rcp
712 *pointImporter_ = Teuchos::rcp
716 auto local_graph_h = graph.getLocalGraphHost ();
717 auto ptr_h = local_graph_h.row_map;
718 ptrHost_ =
decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing(
"graph row offset"), ptr_h.extent(0));
719 Kokkos::deep_copy(ptrHost_, ptr_h);
721 auto ind_h = local_graph_h.entries;
722 indHost_ =
decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing(
"graph column indices"), ind_h.extent(0));
724 Kokkos::deep_copy (indHost_, ind_h);
726 const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
727 val_ =
decltype (val_) (impl_scalar_type_dualview(
"val", numValEnt));
731 template<
class Scalar,
class LO,
class GO,
class Node>
734 const typename local_matrix_device_type::values_type& values,
735 const LO blockSize) :
738 rowMeshMap_ (* (graph.getRowMap ())),
739 blockSize_ (blockSize),
740 X_colMap_ (new Teuchos::RCP<
BMV> ()),
741 Y_rowMap_ (new Teuchos::RCP<
BMV> ()),
742 pointImporter_ (new Teuchos::RCP<typename
crs_graph_type::import_type> ()),
743 offsetPerBlock_ (blockSize * blockSize),
744 localError_ (new bool (false)),
745 errs_ (new Teuchos::RCP<std::ostringstream> ())
748 TEUCHOS_TEST_FOR_EXCEPTION(
749 ! graph_.
isSorted (), std::invalid_argument,
"Tpetra::"
750 "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
751 "rows (isSorted() is false). This class assumes sorted rows.");
753 graphRCP_ = Teuchos::rcpFromRef(graph_);
759 const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
760 TEUCHOS_TEST_FOR_EXCEPTION(
761 blockSizeIsNonpositive, std::invalid_argument,
"Tpetra::"
762 "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
763 " <= 0. The block size must be positive.");
771 const auto colPointMap = Teuchos::rcp
773 *pointImporter_ = Teuchos::rcp
777 auto local_graph_h = graph.getLocalGraphHost ();
778 auto ptr_h = local_graph_h.row_map;
779 ptrHost_ =
decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing(
"graph row offset"), ptr_h.extent(0));
780 Kokkos::deep_copy(ptrHost_, ptr_h);
782 auto ind_h = local_graph_h.entries;
783 indHost_ =
decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing(
"graph column indices"), ind_h.extent(0));
784 Kokkos::deep_copy (indHost_, ind_h);
786 const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
787 TEUCHOS_ASSERT_EQUALITY(numValEnt, values.size());
788 val_ =
decltype (val_) (values);
792 template<
class Scalar,
class LO,
class GO,
class Node>
797 const LO blockSize) :
800 rowMeshMap_ (* (graph.getRowMap ())),
801 domainPointMap_ (domainPointMap),
802 rangePointMap_ (rangePointMap),
803 blockSize_ (blockSize),
804 X_colMap_ (new Teuchos::RCP<
BMV> ()),
805 Y_rowMap_ (new Teuchos::RCP<
BMV> ()),
806 pointImporter_ (new Teuchos::RCP<typename
crs_graph_type::import_type> ()),
807 offsetPerBlock_ (blockSize * blockSize),
808 localError_ (new bool (false)),
809 errs_ (new Teuchos::RCP<std::ostringstream> ())
811 TEUCHOS_TEST_FOR_EXCEPTION(
812 ! graph_.
isSorted (), std::invalid_argument,
"Tpetra::"
813 "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
814 "rows (isSorted() is false). This class assumes sorted rows.");
816 graphRCP_ = Teuchos::rcpFromRef(graph_);
822 const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
823 TEUCHOS_TEST_FOR_EXCEPTION(
824 blockSizeIsNonpositive, std::invalid_argument,
"Tpetra::"
825 "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
826 " <= 0. The block size must be positive.");
830 const auto colPointMap = Teuchos::rcp
832 *pointImporter_ = Teuchos::rcp
836 auto local_graph_h = graph.getLocalGraphHost ();
837 auto ptr_h = local_graph_h.row_map;
838 ptrHost_ =
decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing(
"graph row offset"), ptr_h.extent(0));
840 Kokkos::deep_copy(ptrHost_, ptr_h);
842 auto ind_h = local_graph_h.entries;
843 indHost_ =
decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing(
"graph column indices"), ind_h.extent(0));
845 Kokkos::deep_copy (indHost_, ind_h);
847 const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
848 val_ =
decltype (val_) (impl_scalar_type_dualview(
"val", numValEnt));
853 template<
class Scalar,
class LO,
class GO,
class Node>
854 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
859 return Teuchos::rcp (
new map_type (domainPointMap_));
862 template<
class Scalar,
class LO,
class GO,
class Node>
863 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
868 return Teuchos::rcp (
new map_type (rangePointMap_));
871 template<
class Scalar,
class LO,
class GO,
class Node>
872 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
876 return graph_.getRowMap();
879 template<
class Scalar,
class LO,
class GO,
class Node>
880 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
884 return graph_.getColMap();
887 template<
class Scalar,
class LO,
class GO,
class Node>
892 return graph_.getGlobalNumRows();
895 template<
class Scalar,
class LO,
class GO,
class Node>
900 return graph_.getLocalNumRows();
903 template<
class Scalar,
class LO,
class GO,
class Node>
908 return graph_.getLocalMaxNumRowEntries();
911 template<
class Scalar,
class LO,
class GO,
class Node>
916 Teuchos::ETransp mode,
921 TEUCHOS_TEST_FOR_EXCEPTION(
922 mode != Teuchos::NO_TRANS && mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS,
923 std::invalid_argument,
"Tpetra::BlockCrsMatrix::apply: "
924 "Invalid 'mode' argument. Valid values are Teuchos::NO_TRANS, "
925 "Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
929 const LO blockSize = getBlockSize ();
931 X_view =
BMV (X, * (graph_.getDomainMap ()), blockSize);
932 Y_view =
BMV (Y, * (graph_.getRangeMap ()), blockSize);
934 catch (std::invalid_argument& e) {
935 TEUCHOS_TEST_FOR_EXCEPTION(
936 true, std::invalid_argument,
"Tpetra::BlockCrsMatrix::"
937 "apply: Either the input MultiVector X or the output MultiVector Y "
938 "cannot be viewed as a BlockMultiVector, given this BlockCrsMatrix's "
939 "graph. BlockMultiVector's constructor threw the following exception: "
948 const_cast<this_BCRS_type&
> (*this).applyBlock (X_view, Y_view, mode, alpha, beta);
949 }
catch (std::invalid_argument& e) {
950 TEUCHOS_TEST_FOR_EXCEPTION(
951 true, std::invalid_argument,
"Tpetra::BlockCrsMatrix::"
952 "apply: The implementation method applyBlock complained about having "
953 "an invalid argument. It reported the following: " << e.what ());
954 }
catch (std::logic_error& e) {
955 TEUCHOS_TEST_FOR_EXCEPTION(
956 true, std::invalid_argument,
"Tpetra::BlockCrsMatrix::"
957 "apply: The implementation method applyBlock complained about a "
958 "possible bug in its implementation. It reported the following: "
960 }
catch (std::exception& e) {
961 TEUCHOS_TEST_FOR_EXCEPTION(
962 true, std::invalid_argument,
"Tpetra::BlockCrsMatrix::"
963 "apply: The implementation method applyBlock threw an exception which "
964 "is neither std::invalid_argument nor std::logic_error, but is a "
965 "subclass of std::exception. It reported the following: "
968 TEUCHOS_TEST_FOR_EXCEPTION(
969 true, std::logic_error,
"Tpetra::BlockCrsMatrix::"
970 "apply: The implementation method applyBlock threw an exception which "
971 "is not an instance of a subclass of std::exception. This probably "
972 "indicates a bug. Please report this to the Tpetra developers.");
976 template<
class Scalar,
class LO,
class GO,
class Node>
981 Teuchos::ETransp mode,
985 TEUCHOS_TEST_FOR_EXCEPTION(
987 "Tpetra::BlockCrsMatrix::applyBlock: "
988 "X and Y have different block sizes. X.getBlockSize() = "
992 if (mode == Teuchos::NO_TRANS) {
993 applyBlockNoTrans (X, Y, alpha, beta);
994 }
else if (mode == Teuchos::TRANS || mode == Teuchos::CONJ_TRANS) {
995 applyBlockTrans (X, Y, mode, alpha, beta);
997 TEUCHOS_TEST_FOR_EXCEPTION(
998 true, std::invalid_argument,
"Tpetra::BlockCrsMatrix::"
999 "applyBlock: Invalid 'mode' argument. Valid values are "
1000 "Teuchos::NO_TRANS, Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
1004 template<
class Scalar,
class LO,
class GO,
class Node>
1015 TEUCHOS_TEST_FOR_EXCEPTION(!destMatrix.is_null(), std::invalid_argument,
1016 "destMatrix is required to be null.");
1020 RCP<crs_graph_type> srcGraph = rcp (
new crs_graph_type(this->getCrsGraph()));
1021 RCP<crs_graph_type> destGraph = importAndFillCompleteCrsGraph<crs_graph_type>(srcGraph, importer,
1022 srcGraph->getDomainMap(),
1023 srcGraph->getRangeMap());
1027 destMatrix = rcp (
new this_BCRS_type (*destGraph, getBlockSize()));
1032 template<
class Scalar,
class LO,
class GO,
class Node>
1043 TEUCHOS_TEST_FOR_EXCEPTION(!destMatrix.is_null(), std::invalid_argument,
1044 "destMatrix is required to be null.");
1048 RCP<crs_graph_type> srcGraph = rcp (
new crs_graph_type(this->getCrsGraph()));
1049 RCP<crs_graph_type> destGraph = exportAndFillCompleteCrsGraph<crs_graph_type>(srcGraph, exporter,
1050 srcGraph->getDomainMap(),
1051 srcGraph->getRangeMap());
1055 destMatrix = rcp (
new this_BCRS_type (*destGraph, getBlockSize()));
1060 template<
class Scalar,
class LO,
class GO,
class Node>
1065 auto val_d = val_.getDeviceView(Access::OverwriteAll);
1066 Kokkos::deep_copy(val_d, alpha);
1069 template<
class Scalar,
class LO,
class GO,
class Node>
1074 const Scalar vals[],
1075 const LO numColInds)
const
1077 Kokkos::View<ptrdiff_t*,Kokkos::HostSpace>
1078 offsets_host_view(Kokkos::ViewAllocateWithoutInitializing(
"offsets"), numColInds);
1079 ptrdiff_t * offsets = offsets_host_view.data();
1080 const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
1081 const LO validCount = this->replaceLocalValuesByOffsets(localRowInd, offsets, vals, numOffsets);
1085 template <
class Scalar,
class LO,
class GO,
class Node>
1089 Kokkos::MemoryUnmanaged>& offsets)
const
1091 graph_.getLocalDiagOffsets (offsets);
1094 template <
class Scalar,
class LO,
class GO,
class Node>
1098 Kokkos::MemoryUnmanaged>& diag,
1100 Kokkos::MemoryUnmanaged>& offsets)
const
1102 using Kokkos::parallel_for;
1103 const char prefix[] =
"Tpetra::BlockCrsMatrix::getLocalDiagCopy (2-arg): ";
1105 const LO lclNumMeshRows =
static_cast<LO
> (rowMeshMap_.getLocalNumElements ());
1106 const LO blockSize = this->getBlockSize ();
1107 TEUCHOS_TEST_FOR_EXCEPTION
1108 (
static_cast<LO
> (diag.extent (0)) < lclNumMeshRows ||
1109 static_cast<LO
> (diag.extent (1)) < blockSize ||
1110 static_cast<LO
> (diag.extent (2)) < blockSize,
1111 std::invalid_argument, prefix <<
1112 "The input Kokkos::View is not big enough to hold all the data.");
1113 TEUCHOS_TEST_FOR_EXCEPTION
1114 (
static_cast<LO
> (offsets.size ()) < lclNumMeshRows, std::invalid_argument,
1115 prefix <<
"offsets.size() = " << offsets.size () <<
" < local number of "
1116 "diagonal blocks " << lclNumMeshRows <<
".");
1118 typedef Kokkos::RangePolicy<execution_space, LO> policy_type;
1119 typedef GetLocalDiagCopy<Scalar, LO, GO, Node> functor_type;
1125 auto val_d = val_.getDeviceView(Access::ReadOnly);
1126 parallel_for (policy_type (0, lclNumMeshRows),
1127 functor_type (diag, val_d, offsets,
1128 graph_.getLocalGraphDevice ().row_map, blockSize_));
1131 template<
class Scalar,
class LO,
class GO,
class Node>
1136 const Scalar vals[],
1137 const LO numColInds)
const
1139 Kokkos::View<ptrdiff_t*,Kokkos::HostSpace>
1140 offsets_host_view(Kokkos::ViewAllocateWithoutInitializing(
"offsets"), numColInds);
1141 ptrdiff_t * offsets = offsets_host_view.data();
1142 const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
1143 const LO validCount = this->absMaxLocalValuesByOffsets(localRowInd, offsets, vals, numOffsets);
1148 template<
class Scalar,
class LO,
class GO,
class Node>
1153 const Scalar vals[],
1154 const LO numColInds)
const
1156 Kokkos::View<ptrdiff_t*,Kokkos::HostSpace>
1157 offsets_host_view(Kokkos::ViewAllocateWithoutInitializing(
"offsets"), numColInds);
1158 ptrdiff_t * offsets = offsets_host_view.data();
1159 const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
1160 const LO validCount = this->sumIntoLocalValuesByOffsets(localRowInd, offsets, vals, numOffsets);
1163 template<
class Scalar,
class LO,
class GO,
class Node>
1167 nonconst_local_inds_host_view_type &Indices,
1168 nonconst_values_host_view_type &Values,
1169 size_t& NumEntries)
const
1171 auto vals = getValuesHost(LocalRow);
1172 const LO numInds = ptrHost_(LocalRow+1) - ptrHost_(LocalRow);
1173 if (numInds > (LO)Indices.extent(0) || numInds*blockSize_*blockSize_ > (LO)Values.extent(0)) {
1174 TEUCHOS_TEST_FOR_EXCEPTION(
true, std::runtime_error,
1175 "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
1176 << numInds <<
" row entries");
1178 const LO * colInds = indHost_.data() + ptrHost_(LocalRow);
1179 for (LO i=0; i<numInds; ++i) {
1180 Indices[i] = colInds[i];
1182 for (LO i=0; i<numInds*blockSize_*blockSize_; ++i) {
1183 Values[i] = vals[i];
1185 NumEntries = numInds;
1188 template<
class Scalar,
class LO,
class GO,
class Node>
1192 ptrdiff_t offsets[],
1194 const LO numColInds)
const
1196 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1201 return static_cast<LO
> (0);
1204 const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1208 for (LO k = 0; k < numColInds; ++k) {
1209 const LO relBlockOffset =
1210 this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1211 if (relBlockOffset != LINV) {
1212 offsets[k] =
static_cast<ptrdiff_t
> (relBlockOffset);
1213 hint = relBlockOffset + 1;
1221 template<
class Scalar,
class LO,
class GO,
class Node>
1225 const ptrdiff_t offsets[],
1226 const Scalar vals[],
1227 const LO numOffsets)
const
1229 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1234 return static_cast<LO
> (0);
1241 const size_t perBlockSize =
static_cast<LO
> (offsetPerBlock ());
1242 const ptrdiff_t STINV = Teuchos::OrdinalTraits<ptrdiff_t>::invalid ();
1243 size_t pointOffset = 0;
1246 for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1247 const size_t blockOffset = offsets[k]*perBlockSize;
1248 if (offsets[k] != STINV) {
1251 COPY (A_new, A_old);
1259 template<
class Scalar,
class LO,
class GO,
class Node>
1263 const ptrdiff_t offsets[],
1264 const Scalar vals[],
1265 const LO numOffsets)
const
1267 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1272 return static_cast<LO
> (0);
1274 const impl_scalar_type*
const vIn =
reinterpret_cast<const impl_scalar_type*
> (vals);
1275 auto val_out = getValuesHost(localRowInd);
1276 impl_scalar_type* vOut =
const_cast<impl_scalar_type*
>(val_out.data());
1278 const size_t perBlockSize =
static_cast<LO
> (offsetPerBlock ());
1279 const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1280 size_t pointOffset = 0;
1283 for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1284 const size_t blockOffset = offsets[k]*perBlockSize;
1285 if (blockOffset != STINV) {
1286 little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
1287 const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
1296 template<
class Scalar,
class LO,
class GO,
class Node>
1300 const ptrdiff_t offsets[],
1301 const Scalar vals[],
1302 const LO numOffsets)
const
1304 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1309 return static_cast<LO
> (0);
1317 const size_t perBlockSize =
static_cast<LO
> (offsetPerBlock ());
1318 const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1319 size_t pointOffset = 0;
1322 for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1323 const size_t blockOffset = offsets[k]*perBlockSize;
1324 if (blockOffset != STINV) {
1327 AXPY (ONE, A_new, A_old);
1334 template<
class Scalar,
class LO,
class GO,
class Node>
1336 impl_scalar_type_dualview::t_host::const_type
1340 return val_.getHostView(Access::ReadOnly);
1343 template<
class Scalar,
class LO,
class GO,
class Node>
1344 typename BlockCrsMatrix<Scalar, LO, GO, Node>::
1345 impl_scalar_type_dualview::t_dev::const_type
1346 BlockCrsMatrix<Scalar, LO, GO, Node>::
1347 getValuesDevice ()
const
1349 return val_.getDeviceView(Access::ReadOnly);
1352 template<
class Scalar,
class LO,
class GO,
class Node>
1353 typename BlockCrsMatrix<Scalar, LO, GO, Node>::
1354 impl_scalar_type_dualview::t_host
1358 return val_.getHostView(Access::ReadWrite);
1361 template<
class Scalar,
class LO,
class GO,
class Node>
1363 impl_scalar_type_dualview::t_dev
1367 return val_.getDeviceView(Access::ReadWrite);
1370 template<
class Scalar,
class LO,
class GO,
class Node>
1371 typename BlockCrsMatrix<Scalar, LO, GO, Node>::
1372 impl_scalar_type_dualview::t_host::const_type
1376 const size_t perBlockSize =
static_cast<LO
> (offsetPerBlock ());
1377 auto val = val_.getHostView(Access::ReadOnly);
1378 auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
1382 template<
class Scalar,
class LO,
class GO,
class Node>
1384 impl_scalar_type_dualview::t_dev::const_type
1388 const size_t perBlockSize =
static_cast<LO
> (offsetPerBlock ());
1389 auto val = val_.getDeviceView(Access::ReadOnly);
1390 auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
1394 template<
class Scalar,
class LO,
class GO,
class Node>
1399 const size_t perBlockSize =
static_cast<LO
> (offsetPerBlock ());
1400 auto val = val_.getHostView(Access::ReadWrite);
1401 auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
1405 template<
class Scalar,
class LO,
class GO,
class Node>
1410 const size_t perBlockSize =
static_cast<LO
> (offsetPerBlock ());
1411 auto val = val_.getDeviceView(Access::ReadWrite);
1412 auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
1416 template<
class Scalar,
class LO,
class GO,
class Node>
1421 const size_t numEntInGraph = graph_.getNumEntriesInLocalRow (localRowInd);
1422 if (numEntInGraph == Teuchos::OrdinalTraits<size_t>::invalid ()) {
1423 return static_cast<LO
> (0);
1425 return static_cast<LO
> (numEntInGraph);
1429 template<
class Scalar,
class LO,
class GO,
class Node>
1430 typename BlockCrsMatrix<Scalar, LO, GO, Node>::local_matrix_device_type
1434 auto numCols = this->graph_.getColMap()->getLocalNumElements();
1435 auto val = val_.getDeviceView(Access::ReadWrite);
1436 const LO blockSize = this->getBlockSize ();
1437 const auto graph = this->graph_.getLocalGraphDevice ();
1439 return local_matrix_device_type(
"Tpetra::BlockCrsMatrix::lclMatrixDevice",
1446 template<
class Scalar,
class LO,
class GO,
class Node>
1451 const Teuchos::ETransp mode,
1461 TEUCHOS_TEST_FOR_EXCEPTION(
1462 true, std::logic_error,
"Tpetra::BlockCrsMatrix::apply: "
1463 "transpose and conjugate transpose modes are not yet implemented.");
1466 template<
class Scalar,
class LO,
class GO,
class Node>
1468 BlockCrsMatrix<Scalar, LO, GO, Node>::
1469 applyBlockNoTrans (
const BlockMultiVector<Scalar, LO, GO, Node>& X,
1470 BlockMultiVector<Scalar, LO, GO, Node>& Y,
1476 typedef ::Tpetra::Import<LO, GO, Node> import_type;
1477 typedef ::Tpetra::Export<LO, GO, Node> export_type;
1478 const Scalar zero = STS::zero ();
1479 const Scalar one = STS::one ();
1480 RCP<const import_type>
import = graph_.getImporter ();
1482 RCP<const export_type> theExport = graph_.getExporter ();
1483 const char prefix[] =
"Tpetra::BlockCrsMatrix::applyBlockNoTrans: ";
1485 if (alpha == zero) {
1489 else if (beta != one) {
1494 const BMV* X_colMap = NULL;
1495 if (
import.is_null ()) {
1499 catch (std::exception& e) {
1500 TEUCHOS_TEST_FOR_EXCEPTION
1501 (
true, std::logic_error, prefix <<
"Tpetra::MultiVector::"
1502 "operator= threw an exception: " << std::endl << e.what ());
1512 if ((*X_colMap_).is_null () ||
1513 (**X_colMap_).getNumVectors () != X.getNumVectors () ||
1514 (**X_colMap_).getBlockSize () != X.getBlockSize ()) {
1515 *X_colMap_ = rcp (
new BMV (* (graph_.getColMap ()), getBlockSize (),
1516 static_cast<LO
> (X.getNumVectors ())));
1518 (*X_colMap_)->getMultiVectorView().doImport (X.getMultiVectorView (),
1522 X_colMap = &(**X_colMap_);
1524 catch (std::exception& e) {
1525 TEUCHOS_TEST_FOR_EXCEPTION
1526 (
true, std::logic_error, prefix <<
"Tpetra::MultiVector::"
1527 "operator= threw an exception: " << std::endl << e.what ());
1531 BMV* Y_rowMap = NULL;
1532 if (theExport.is_null ()) {
1535 else if ((*Y_rowMap_).is_null () ||
1536 (**Y_rowMap_).getNumVectors () != Y.getNumVectors () ||
1537 (**Y_rowMap_).getBlockSize () != Y.getBlockSize ()) {
1538 *Y_rowMap_ = rcp (
new BMV (* (graph_.getRowMap ()), getBlockSize (),
1539 static_cast<LO
> (X.getNumVectors ())));
1541 Y_rowMap = &(**Y_rowMap_);
1543 catch (std::exception& e) {
1544 TEUCHOS_TEST_FOR_EXCEPTION(
1545 true, std::logic_error, prefix <<
"Tpetra::MultiVector::"
1546 "operator= threw an exception: " << std::endl << e.what ());
1551 localApplyBlockNoTrans (*X_colMap, *Y_rowMap, alpha, beta);
1553 catch (std::exception& e) {
1554 TEUCHOS_TEST_FOR_EXCEPTION
1555 (
true, std::runtime_error, prefix <<
"localApplyBlockNoTrans threw "
1556 "an exception: " << e.what ());
1559 TEUCHOS_TEST_FOR_EXCEPTION
1560 (
true, std::runtime_error, prefix <<
"localApplyBlockNoTrans threw "
1561 "an exception not a subclass of std::exception.");
1564 if (! theExport.is_null ()) {
1570 template<
class Scalar,
class LO,
class GO,
class Node>
1572 BlockCrsMatrix<Scalar, LO, GO, Node>::
1573 localApplyBlockNoTrans (
const BlockMultiVector<Scalar, LO, GO, Node>& X,
1574 BlockMultiVector<Scalar, LO, GO, Node>& Y,
1578 using ::Tpetra::Impl::bcrsLocalApplyNoTrans;
1580 const impl_scalar_type alpha_impl = alpha;
1581 const auto graph = this->graph_.getLocalGraphDevice ();
1582 const impl_scalar_type beta_impl = beta;
1583 const LO blockSize = this->getBlockSize ();
1585 auto X_mv = X.getMultiVectorView ();
1586 auto Y_mv = Y.getMultiVectorView ();
1590 auto X_lcl = X_mv.getLocalViewDevice (Access::ReadOnly);
1591 auto Y_lcl = Y_mv.getLocalViewDevice (Access::ReadWrite);
1592 auto val = val_.getDeviceView(Access::ReadWrite);
1594 bcrsLocalApplyNoTrans (alpha_impl, graph, val, blockSize, X_lcl,
1598 template<
class Scalar,
class LO,
class GO,
class Node>
1600 BlockCrsMatrix<Scalar, LO, GO, Node>::
1601 findRelOffsetOfColumnIndex (
const LO localRowIndex,
1602 const LO colIndexToFind,
1603 const LO hint)
const
1605 const size_t absStartOffset = ptrHost_[localRowIndex];
1606 const size_t absEndOffset = ptrHost_[localRowIndex+1];
1607 const LO numEntriesInRow =
static_cast<LO
> (absEndOffset - absStartOffset);
1609 const LO*
const curInd = indHost_.data () + absStartOffset;
1612 if (hint < numEntriesInRow && curInd[hint] == colIndexToFind) {
1619 LO relOffset = Teuchos::OrdinalTraits<LO>::invalid ();
1624 const LO maxNumEntriesForLinearSearch = 32;
1625 if (numEntriesInRow > maxNumEntriesForLinearSearch) {
1630 const LO* beg = curInd;
1631 const LO* end = curInd + numEntriesInRow;
1632 std::pair<const LO*, const LO*> p =
1633 std::equal_range (beg, end, colIndexToFind);
1634 if (p.first != p.second) {
1636 relOffset =
static_cast<LO
> (p.first - beg);
1640 for (LO k = 0; k < numEntriesInRow; ++k) {
1641 if (colIndexToFind == curInd[k]) {
1651 template<
class Scalar,
class LO,
class GO,
class Node>
1653 BlockCrsMatrix<Scalar, LO, GO, Node>::
1654 offsetPerBlock ()
const
1656 return offsetPerBlock_;
1659 template<
class Scalar,
class LO,
class GO,
class Node>
1661 BlockCrsMatrix<Scalar, LO, GO, Node>::
1662 getConstLocalBlockFromInput (
const impl_scalar_type* val,
1663 const size_t pointOffset)
const
1666 const LO rowStride = blockSize_;
1667 return const_little_block_type (val + pointOffset, blockSize_, rowStride);
1670 template<
class Scalar,
class LO,
class GO,
class Node>
1672 BlockCrsMatrix<Scalar, LO, GO, Node>::
1673 getNonConstLocalBlockFromInput (impl_scalar_type* val,
1674 const size_t pointOffset)
const
1677 const LO rowStride = blockSize_;
1678 return little_block_type (val + pointOffset, blockSize_, rowStride);
1681 template<
class Scalar,
class LO,
class GO,
class Node>
1682 typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
1683 BlockCrsMatrix<Scalar, LO, GO, Node>::
1684 getNonConstLocalBlockFromInputHost (impl_scalar_type* val,
1685 const size_t pointOffset)
const
1688 const LO rowStride = blockSize_;
1689 const size_t bs2 = blockSize_ * blockSize_;
1690 return little_block_host_type (val + bs2 * pointOffset, blockSize_, rowStride);
1693 template<
class Scalar,
class LO,
class GO,
class Node>
1695 BlockCrsMatrix<Scalar, LO, GO, Node>::
1696 getLocalBlockDeviceNonConst (
const LO localRowInd,
const LO localColInd)
const
1698 using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1700 const size_t absRowBlockOffset = ptrHost_[localRowInd];
1701 const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
1702 if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
1703 const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1704 auto vals =
const_cast<this_BCRS_type&
>(*this).getValuesDeviceNonConst();
1705 auto r_val = getNonConstLocalBlockFromInput (vals.data(), absBlockOffset);
1709 return little_block_type ();
1713 template<
class Scalar,
class LO,
class GO,
class Node>
1714 typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
1715 BlockCrsMatrix<Scalar, LO, GO, Node>::
1716 getLocalBlockHostNonConst (
const LO localRowInd,
const LO localColInd)
const
1718 using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1720 const size_t absRowBlockOffset = ptrHost_[localRowInd];
1721 const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
1722 if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
1723 const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1724 auto vals =
const_cast<this_BCRS_type&
>(*this).getValuesHostNonConst();
1725 auto r_val = getNonConstLocalBlockFromInputHost (vals.data(), absBlockOffset);
1729 return little_block_host_type ();
1734 template<
class Scalar,
class LO,
class GO,
class Node>
1736 BlockCrsMatrix<Scalar, LO, GO, Node>::
1737 checkSizes (const ::Tpetra::SrcDistObject& source)
1740 typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
1741 const this_BCRS_type* src =
dynamic_cast<const this_BCRS_type*
> (&source);
1744 std::ostream& err = markLocalErrorAndGetStream ();
1745 err <<
"checkSizes: The source object of the Import or Export "
1746 "must be a BlockCrsMatrix with the same template parameters as the "
1747 "target object." << endl;
1752 if (src->getBlockSize () != this->getBlockSize ()) {
1753 std::ostream& err = markLocalErrorAndGetStream ();
1754 err <<
"checkSizes: The source and target objects of the Import or "
1755 <<
"Export must have the same block sizes. The source's block "
1756 <<
"size = " << src->getBlockSize () <<
" != the target's block "
1757 <<
"size = " << this->getBlockSize () <<
"." << endl;
1759 if (! src->graph_.isFillComplete ()) {
1760 std::ostream& err = markLocalErrorAndGetStream ();
1761 err <<
"checkSizes: The source object of the Import or Export is "
1762 "not fill complete. Both source and target objects must be fill "
1763 "complete." << endl;
1765 if (! this->graph_.isFillComplete ()) {
1766 std::ostream& err = markLocalErrorAndGetStream ();
1767 err <<
"checkSizes: The target object of the Import or Export is "
1768 "not fill complete. Both source and target objects must be fill "
1769 "complete." << endl;
1771 if (src->graph_.getColMap ().is_null ()) {
1772 std::ostream& err = markLocalErrorAndGetStream ();
1773 err <<
"checkSizes: The source object of the Import or Export does "
1774 "not have a column Map. Both source and target objects must have "
1775 "column Maps." << endl;
1777 if (this->graph_.getColMap ().is_null ()) {
1778 std::ostream& err = markLocalErrorAndGetStream ();
1779 err <<
"checkSizes: The target object of the Import or Export does "
1780 "not have a column Map. Both source and target objects must have "
1781 "column Maps." << endl;
1785 return ! (* (this->localError_));
1788 template<
class Scalar,
class LO,
class GO,
class Node>
1792 (const ::Tpetra::SrcDistObject& source,
1793 const size_t numSameIDs,
1800 using ::Tpetra::Details::Behavior;
1801 using ::Tpetra::Details::dualViewStatusToString;
1802 using ::Tpetra::Details::ProfilingRegion;
1806 ProfilingRegion profile_region(
"Tpetra::BlockCrsMatrix::copyAndPermute");
1807 const bool debug = Behavior::debug();
1808 const bool verbose = Behavior::verbose();
1813 std::ostringstream os;
1814 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1815 os <<
"Proc " << myRank <<
": BlockCrsMatrix::copyAndPermute : " << endl;
1820 if (* (this->localError_)) {
1821 std::ostream& err = this->markLocalErrorAndGetStream ();
1823 <<
"The target object of the Import or Export is already in an error state."
1832 std::ostringstream os;
1833 os << prefix << endl
1834 << prefix <<
" " << dualViewStatusToString (permuteToLIDs,
"permuteToLIDs") << endl
1835 << prefix <<
" " << dualViewStatusToString (permuteFromLIDs,
"permuteFromLIDs") << endl;
1836 std::cerr << os.str ();
1842 if (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0)) {
1843 std::ostream& err = this->markLocalErrorAndGetStream ();
1845 <<
"permuteToLIDs.extent(0) = " << permuteToLIDs.extent (0)
1846 <<
" != permuteFromLIDs.extent(0) = " << permuteFromLIDs.extent(0)
1850 if (permuteToLIDs.need_sync_host () || permuteFromLIDs.need_sync_host ()) {
1851 std::ostream& err = this->markLocalErrorAndGetStream ();
1853 <<
"Both permuteToLIDs and permuteFromLIDs must be sync'd to host."
1858 const this_BCRS_type* src =
dynamic_cast<const this_BCRS_type*
> (&source);
1860 std::ostream& err = this->markLocalErrorAndGetStream ();
1862 <<
"The source (input) object of the Import or "
1863 "Export is either not a BlockCrsMatrix, or does not have the right "
1864 "template parameters. checkSizes() should have caught this. "
1865 "Please report this bug to the Tpetra developers." << endl;
1869 bool lclErr =
false;
1870#ifdef HAVE_TPETRA_DEBUG
1871 std::set<LO> invalidSrcCopyRows;
1872 std::set<LO> invalidDstCopyRows;
1873 std::set<LO> invalidDstCopyCols;
1874 std::set<LO> invalidDstPermuteCols;
1875 std::set<LO> invalidPermuteFromRows;
1885#ifdef HAVE_TPETRA_DEBUG
1886 const map_type& srcRowMap = * (src->graph_.getRowMap ());
1888 const map_type& dstRowMap = * (this->graph_.getRowMap ());
1889 const map_type& srcColMap = * (src->graph_.getColMap ());
1890 const map_type& dstColMap = * (this->graph_.getColMap ());
1891 const bool canUseLocalColumnIndices = srcColMap.
locallySameAs (dstColMap);
1893 const size_t numPermute =
static_cast<size_t> (permuteFromLIDs.extent(0));
1895 std::ostringstream os;
1897 <<
"canUseLocalColumnIndices: "
1898 << (canUseLocalColumnIndices ?
"true" :
"false")
1899 <<
", numPermute: " << numPermute
1901 std::cerr << os.str ();
1904 const auto permuteToLIDsHost = permuteToLIDs.view_host();
1905 const auto permuteFromLIDsHost = permuteFromLIDs.view_host();
1907 if (canUseLocalColumnIndices) {
1909 for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
1910#ifdef HAVE_TPETRA_DEBUG
1913 invalidSrcCopyRows.insert (localRow);
1918 local_inds_host_view_type lclSrcCols;
1919 values_host_view_type vals;
1923 src->getLocalRowView (localRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1924 if (numEntries > 0) {
1925 LO err = this->replaceLocalValues (localRow, lclSrcCols.data(),
reinterpret_cast<const scalar_type*
>(vals.data()), numEntries);
1926 if (err != numEntries) {
1929#ifdef HAVE_TPETRA_DEBUG
1930 invalidDstCopyRows.insert (localRow);
1939 for (LO k = 0; k < numEntries; ++k) {
1942#ifdef HAVE_TPETRA_DEBUG
1943 (void) invalidDstCopyCols.insert (lclSrcCols[k]);
1953 for (
size_t k = 0; k < numPermute; ++k) {
1954 const LO srcLclRow =
static_cast<LO
> (permuteFromLIDsHost(k));
1955 const LO dstLclRow =
static_cast<LO
> (permuteToLIDsHost(k));
1957 local_inds_host_view_type lclSrcCols;
1958 values_host_view_type vals;
1960 src->getLocalRowView (srcLclRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1961 if (numEntries > 0) {
1962 LO err = this->replaceLocalValues (dstLclRow, lclSrcCols.data(),
reinterpret_cast<const scalar_type*
>(vals.data()), numEntries);
1963 if (err != numEntries) {
1965#ifdef HAVE_TPETRA_DEBUG
1966 for (LO c = 0; c < numEntries; ++c) {
1968 invalidDstPermuteCols.insert (lclSrcCols[c]);
1978 const size_t maxNumEnt = src->graph_.getLocalMaxNumRowEntries ();
1979 Teuchos::Array<LO> lclDstCols (maxNumEnt);
1982 for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
1983 local_inds_host_view_type lclSrcCols;
1984 values_host_view_type vals;
1990 src->getLocalRowView (localRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1991 }
catch (std::exception& e) {
1993 std::ostringstream os;
1994 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1995 os <<
"Proc " << myRank <<
": copyAndPermute: At \"same\" localRow "
1996 << localRow <<
", src->getLocalRowView() threw an exception: "
1998 std::cerr << os.str ();
2003 if (numEntries > 0) {
2004 if (
static_cast<size_t> (numEntries) >
static_cast<size_t> (lclDstCols.size ())) {
2007 std::ostringstream os;
2008 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2009 os <<
"Proc " << myRank <<
": copyAndPermute: At \"same\" localRow "
2010 << localRow <<
", numEntries = " << numEntries <<
" > maxNumEnt = "
2011 << maxNumEnt << endl;
2012 std::cerr << os.str ();
2018 Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
2019 for (LO j = 0; j < numEntries; ++j) {
2021 if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
2023#ifdef HAVE_TPETRA_DEBUG
2024 invalidDstCopyCols.insert (lclDstColsView[j]);
2030 err = this->replaceLocalValues (localRow, lclDstColsView.getRawPtr (),
reinterpret_cast<const scalar_type*
>(vals.data()), numEntries);
2031 }
catch (std::exception& e) {
2033 std::ostringstream os;
2034 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2035 os <<
"Proc " << myRank <<
": copyAndPermute: At \"same\" localRow "
2036 << localRow <<
", this->replaceLocalValues() threw an exception: "
2038 std::cerr << os.str ();
2042 if (err != numEntries) {
2045 std::ostringstream os;
2046 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2047 os <<
"Proc " << myRank <<
": copyAndPermute: At \"same\" "
2048 "localRow " << localRow <<
", this->replaceLocalValues "
2049 "returned " << err <<
" instead of numEntries = "
2050 << numEntries << endl;
2051 std::cerr << os.str ();
2059 for (
size_t k = 0; k < numPermute; ++k) {
2060 const LO srcLclRow =
static_cast<LO
> (permuteFromLIDsHost(k));
2061 const LO dstLclRow =
static_cast<LO
> (permuteToLIDsHost(k));
2063 local_inds_host_view_type lclSrcCols;
2064 values_host_view_type vals;
2068 src->getLocalRowView (srcLclRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
2069 }
catch (std::exception& e) {
2071 std::ostringstream os;
2072 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2073 os <<
"Proc " << myRank <<
": copyAndPermute: At \"permute\" "
2074 "srcLclRow " << srcLclRow <<
" and dstLclRow " << dstLclRow
2075 <<
", src->getLocalRowView() threw an exception: " << e.what ();
2076 std::cerr << os.str ();
2081 if (numEntries > 0) {
2082 if (
static_cast<size_t> (numEntries) >
static_cast<size_t> (lclDstCols.size ())) {
2088 Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
2089 for (LO j = 0; j < numEntries; ++j) {
2091 if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
2093#ifdef HAVE_TPETRA_DEBUG
2094 invalidDstPermuteCols.insert (lclDstColsView[j]);
2098 LO err = this->replaceLocalValues (dstLclRow, lclDstColsView.getRawPtr (),
reinterpret_cast<const scalar_type*
>(vals.data()), numEntries);
2099 if (err != numEntries) {
2108 std::ostream& err = this->markLocalErrorAndGetStream ();
2109#ifdef HAVE_TPETRA_DEBUG
2110 err <<
"copyAndPermute: The graph structure of the source of the "
2111 "Import or Export must be a subset of the graph structure of the "
2113 err <<
"invalidSrcCopyRows = [";
2114 for (
typename std::set<LO>::const_iterator it = invalidSrcCopyRows.begin ();
2115 it != invalidSrcCopyRows.end (); ++it) {
2117 typename std::set<LO>::const_iterator itp1 = it;
2119 if (itp1 != invalidSrcCopyRows.end ()) {
2123 err <<
"], invalidDstCopyRows = [";
2124 for (
typename std::set<LO>::const_iterator it = invalidDstCopyRows.begin ();
2125 it != invalidDstCopyRows.end (); ++it) {
2127 typename std::set<LO>::const_iterator itp1 = it;
2129 if (itp1 != invalidDstCopyRows.end ()) {
2133 err <<
"], invalidDstCopyCols = [";
2134 for (
typename std::set<LO>::const_iterator it = invalidDstCopyCols.begin ();
2135 it != invalidDstCopyCols.end (); ++it) {
2137 typename std::set<LO>::const_iterator itp1 = it;
2139 if (itp1 != invalidDstCopyCols.end ()) {
2143 err <<
"], invalidDstPermuteCols = [";
2144 for (
typename std::set<LO>::const_iterator it = invalidDstPermuteCols.begin ();
2145 it != invalidDstPermuteCols.end (); ++it) {
2147 typename std::set<LO>::const_iterator itp1 = it;
2149 if (itp1 != invalidDstPermuteCols.end ()) {
2153 err <<
"], invalidPermuteFromRows = [";
2154 for (
typename std::set<LO>::const_iterator it = invalidPermuteFromRows.begin ();
2155 it != invalidPermuteFromRows.end (); ++it) {
2157 typename std::set<LO>::const_iterator itp1 = it;
2159 if (itp1 != invalidPermuteFromRows.end ()) {
2165 err <<
"copyAndPermute: The graph structure of the source of the "
2166 "Import or Export must be a subset of the graph structure of the "
2172 std::ostringstream os;
2173 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2174 const bool lclSuccess = ! (* (this->localError_));
2175 os <<
"*** Proc " << myRank <<
": copyAndPermute "
2176 << (lclSuccess ?
"succeeded" :
"FAILED");
2180 os <<
": error messages: " << this->errorMessages ();
2182 std::cerr << os.str ();
2206 template<
class LO,
class GO>
2208 packRowCount (
const size_t numEnt,
2209 const size_t numBytesPerValue,
2210 const size_t blkSize)
2212 using ::Tpetra::Details::PackTraits;
2222 const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
2223 const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
2224 const size_t valsLen = numEnt * numBytesPerValue * blkSize * blkSize;
2225 return numEntLen + gidsLen + valsLen;
2239 template<
class ST,
class LO,
class GO>
2241 unpackRowCount (
const typename ::Tpetra::Details::PackTraits<LO>::input_buffer_type& imports,
2242 const size_t offset,
2243 const size_t numBytes,
2246 using Kokkos::subview;
2247 using ::Tpetra::Details::PackTraits;
2249 if (numBytes == 0) {
2251 return static_cast<size_t> (0);
2255 const size_t theNumBytes = PackTraits<LO>::packValueCount (numEntLO);
2256 TEUCHOS_TEST_FOR_EXCEPTION
2257 (theNumBytes > numBytes, std::logic_error,
"unpackRowCount: "
2258 "theNumBytes = " << theNumBytes <<
" < numBytes = " << numBytes
2260 const char*
const inBuf = imports.data () + offset;
2261 const size_t actualNumBytes = PackTraits<LO>::unpackValue (numEntLO, inBuf);
2262 TEUCHOS_TEST_FOR_EXCEPTION
2263 (actualNumBytes > numBytes, std::logic_error,
"unpackRowCount: "
2264 "actualNumBytes = " << actualNumBytes <<
" < numBytes = " << numBytes
2266 return static_cast<size_t> (numEntLO);
2273 template<
class ST,
class LO,
class GO>
2275 packRowForBlockCrs (
const typename ::Tpetra::Details::PackTraits<LO>::output_buffer_type exports,
2276 const size_t offset,
2277 const size_t numEnt,
2278 const typename ::Tpetra::Details::PackTraits<GO>::input_array_type& gidsIn,
2279 const typename ::Tpetra::Details::PackTraits<ST>::input_array_type& valsIn,
2280 const size_t numBytesPerValue,
2281 const size_t blockSize)
2283 using Kokkos::subview;
2284 using ::Tpetra::Details::PackTraits;
2290 const size_t numScalarEnt = numEnt * blockSize * blockSize;
2293 const LO numEntLO =
static_cast<size_t> (numEnt);
2295 const size_t numEntBeg = offset;
2296 const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
2297 const size_t gidsBeg = numEntBeg + numEntLen;
2298 const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
2299 const size_t valsBeg = gidsBeg + gidsLen;
2300 const size_t valsLen = numScalarEnt * numBytesPerValue;
2302 char*
const numEntOut = exports.data () + numEntBeg;
2303 char*
const gidsOut = exports.data () + gidsBeg;
2304 char*
const valsOut = exports.data () + valsBeg;
2306 size_t numBytesOut = 0;
2308 numBytesOut += PackTraits<LO>::packValue (numEntOut, numEntLO);
2311 Kokkos::pair<int, size_t> p;
2312 p = PackTraits<GO>::packArray (gidsOut, gidsIn.data (), numEnt);
2313 errorCode += p.first;
2314 numBytesOut += p.second;
2316 p = PackTraits<ST>::packArray (valsOut, valsIn.data (), numScalarEnt);
2317 errorCode += p.first;
2318 numBytesOut += p.second;
2321 const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2322 TEUCHOS_TEST_FOR_EXCEPTION
2323 (numBytesOut != expectedNumBytes, std::logic_error,
2324 "packRowForBlockCrs: numBytesOut = " << numBytesOut
2325 <<
" != expectedNumBytes = " << expectedNumBytes <<
".");
2327 TEUCHOS_TEST_FOR_EXCEPTION
2328 (errorCode != 0, std::runtime_error,
"packRowForBlockCrs: "
2329 "PackTraits::packArray returned a nonzero error code");
2335 template<
class ST,
class LO,
class GO>
2337 unpackRowForBlockCrs (
const typename ::Tpetra::Details::PackTraits<GO>::output_array_type& gidsOut,
2338 const typename ::Tpetra::Details::PackTraits<ST>::output_array_type& valsOut,
2339 const typename ::Tpetra::Details::PackTraits<int>::input_buffer_type& imports,
2340 const size_t offset,
2341 const size_t numBytes,
2342 const size_t numEnt,
2343 const size_t numBytesPerValue,
2344 const size_t blockSize)
2346 using ::Tpetra::Details::PackTraits;
2348 if (numBytes == 0) {
2352 const size_t numScalarEnt = numEnt * blockSize * blockSize;
2353 TEUCHOS_TEST_FOR_EXCEPTION
2354 (
static_cast<size_t> (imports.extent (0)) <= offset,
2355 std::logic_error,
"unpackRowForBlockCrs: imports.extent(0) = "
2356 << imports.extent (0) <<
" <= offset = " << offset <<
".");
2357 TEUCHOS_TEST_FOR_EXCEPTION
2358 (
static_cast<size_t> (imports.extent (0)) < offset + numBytes,
2359 std::logic_error,
"unpackRowForBlockCrs: imports.extent(0) = "
2360 << imports.extent (0) <<
" < offset + numBytes = "
2361 << (offset + numBytes) <<
".");
2366 const size_t numEntBeg = offset;
2367 const size_t numEntLen = PackTraits<LO>::packValueCount (lid);
2368 const size_t gidsBeg = numEntBeg + numEntLen;
2369 const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
2370 const size_t valsBeg = gidsBeg + gidsLen;
2371 const size_t valsLen = numScalarEnt * numBytesPerValue;
2373 const char*
const numEntIn = imports.data () + numEntBeg;
2374 const char*
const gidsIn = imports.data () + gidsBeg;
2375 const char*
const valsIn = imports.data () + valsBeg;
2377 size_t numBytesOut = 0;
2380 numBytesOut += PackTraits<LO>::unpackValue (numEntOut, numEntIn);
2381 TEUCHOS_TEST_FOR_EXCEPTION
2382 (
static_cast<size_t> (numEntOut) != numEnt, std::logic_error,
2383 "unpackRowForBlockCrs: Expected number of entries " << numEnt
2384 <<
" != actual number of entries " << numEntOut <<
".");
2387 Kokkos::pair<int, size_t> p;
2388 p = PackTraits<GO>::unpackArray (gidsOut.data (), gidsIn, numEnt);
2389 errorCode += p.first;
2390 numBytesOut += p.second;
2392 p = PackTraits<ST>::unpackArray (valsOut.data (), valsIn, numScalarEnt);
2393 errorCode += p.first;
2394 numBytesOut += p.second;
2397 TEUCHOS_TEST_FOR_EXCEPTION
2398 (numBytesOut != numBytes, std::logic_error,
2399 "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
2400 <<
" != numBytes = " << numBytes <<
".");
2402 const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2403 TEUCHOS_TEST_FOR_EXCEPTION
2404 (numBytesOut != expectedNumBytes, std::logic_error,
2405 "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
2406 <<
" != expectedNumBytes = " << expectedNumBytes <<
".");
2408 TEUCHOS_TEST_FOR_EXCEPTION
2409 (errorCode != 0, std::runtime_error,
"unpackRowForBlockCrs: "
2410 "PackTraits::unpackArray returned a nonzero error code");
2416 template<
class Scalar,
class LO,
class GO,
class Node>
2420 (const ::Tpetra::SrcDistObject& source,
2425 Kokkos::DualView<
size_t*,
2427 size_t& constantNumPackets)
2429 using ::Tpetra::Details::Behavior;
2430 using ::Tpetra::Details::dualViewStatusToString;
2431 using ::Tpetra::Details::ProfilingRegion;
2432 using ::Tpetra::Details::PackTraits;
2434 typedef typename Kokkos::View<int*, device_type>::HostMirror::execution_space host_exec;
2438 ProfilingRegion profile_region(
"Tpetra::BlockCrsMatrix::packAndPrepare");
2440 const bool debug = Behavior::debug();
2441 const bool verbose = Behavior::verbose();
2446 std::ostringstream os;
2447 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2448 os <<
"Proc " << myRank <<
": BlockCrsMatrix::packAndPrepare: " << std::endl;
2453 if (* (this->localError_)) {
2454 std::ostream& err = this->markLocalErrorAndGetStream ();
2456 <<
"The target object of the Import or Export is already in an error state."
2465 std::ostringstream os;
2466 os << prefix << std::endl
2467 << prefix <<
" " << dualViewStatusToString (exportLIDs,
"exportLIDs") << std::endl
2468 << prefix <<
" " << dualViewStatusToString (exports,
"exports") << std::endl
2469 << prefix <<
" " << dualViewStatusToString (numPacketsPerLID,
"numPacketsPerLID") << std::endl;
2470 std::cerr << os.str ();
2476 if (exportLIDs.extent (0) != numPacketsPerLID.extent (0)) {
2477 std::ostream& err = this->markLocalErrorAndGetStream ();
2479 <<
"exportLIDs.extent(0) = " << exportLIDs.extent (0)
2480 <<
" != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
2481 <<
"." << std::endl;
2484 if (exportLIDs.need_sync_host ()) {
2485 std::ostream& err = this->markLocalErrorAndGetStream ();
2486 err << prefix <<
"exportLIDs be sync'd to host." << std::endl;
2490 const this_BCRS_type* src =
dynamic_cast<const this_BCRS_type*
> (&source);
2492 std::ostream& err = this->markLocalErrorAndGetStream ();
2494 <<
"The source (input) object of the Import or "
2495 "Export is either not a BlockCrsMatrix, or does not have the right "
2496 "template parameters. checkSizes() should have caught this. "
2497 "Please report this bug to the Tpetra developers." << std::endl;
2509 constantNumPackets = 0;
2513 const size_t blockSize =
static_cast<size_t> (src->getBlockSize ());
2514 const size_t numExportLIDs = exportLIDs.extent (0);
2515 size_t numBytesPerValue(0);
2517 auto val_host = val_.getHostView(Access::ReadOnly);
2519 PackTraits<impl_scalar_type>
2527 Impl::BlockCrsRowStruct<size_t> rowReducerStruct;
2530 auto exportLIDsHost = exportLIDs.view_host();
2531 auto numPacketsPerLIDHost = numPacketsPerLID.view_host();
2532 numPacketsPerLID.modify_host ();
2534 using reducer_type = Impl::BlockCrsReducer<Impl::BlockCrsRowStruct<size_t>,host_exec>;
2535 const auto policy = Kokkos::RangePolicy<host_exec>(
size_t(0), numExportLIDs);
2536 Kokkos::parallel_reduce
2538 [=](
const size_t &i,
typename reducer_type::value_type &update) {
2539 const LO lclRow = exportLIDsHost(i);
2541 numEnt = (numEnt == Teuchos::OrdinalTraits<size_t>::invalid () ? 0 : numEnt);
2543 const size_t numBytes =
2544 packRowCount<LO, GO> (numEnt, numBytesPerValue, blockSize);
2545 numPacketsPerLIDHost(i) = numBytes;
2546 update +=
typename reducer_type::value_type(numEnt, numBytes, numEnt);
2547 }, rowReducerStruct);
2553 const size_t totalNumBytes = rowReducerStruct.totalNumBytes;
2554 const size_t totalNumEntries = rowReducerStruct.totalNumEntries;
2555 const size_t maxRowLength = rowReducerStruct.maxRowLength;
2558 std::ostringstream os;
2560 <<
"totalNumBytes = " << totalNumBytes <<
", totalNumEntries = " << totalNumEntries
2562 std::cerr << os.str ();
2568 if(exports.extent(0) != totalNumBytes)
2570 const std::string oldLabel = exports.d_view.label ();
2571 const std::string newLabel = (oldLabel ==
"") ?
"exports" : oldLabel;
2572 exports = Kokkos::DualView<packet_type*, buffer_device_type>(newLabel, totalNumBytes);
2574 if (totalNumEntries > 0) {
2576 Kokkos::View<size_t*, host_exec> offset(
"offset", numExportLIDs+1);
2578 const auto policy = Kokkos::RangePolicy<host_exec>(
size_t(0), numExportLIDs+1);
2579 Kokkos::parallel_scan
2581 [=](
const size_t &i,
size_t &update,
const bool &
final) {
2582 if (
final) offset(i) = update;
2583 update += (i == numExportLIDs ? 0 : numPacketsPerLIDHost(i));
2586 if (offset(numExportLIDs) != totalNumBytes) {
2587 std::ostream& err = this->markLocalErrorAndGetStream ();
2589 <<
"At end of method, the final offset (in bytes) "
2590 << offset(numExportLIDs) <<
" does not equal the total number of bytes packed "
2591 << totalNumBytes <<
". "
2592 <<
"Please report this bug to the Tpetra developers." << std::endl;
2606 exports.modify_host();
2608 typedef Kokkos::TeamPolicy<host_exec> policy_type;
2610 policy_type(numExportLIDs, 1, 1)
2611 .set_scratch_size(0, Kokkos::PerTeam(
sizeof(GO)*maxRowLength));
2612 Kokkos::parallel_for
2614 [=](
const typename policy_type::member_type &member) {
2615 const size_t i = member.league_rank();
2616 Kokkos::View<GO*, typename host_exec::scratch_memory_space>
2617 gblColInds(member.team_scratch(0), maxRowLength);
2619 const LO lclRowInd = exportLIDsHost(i);
2620 local_inds_host_view_type lclColInds;
2621 values_host_view_type vals;
2626 src->getLocalRowView (lclRowInd, lclColInds, vals);
2627 const size_t numEnt = lclColInds.extent(0);
2630 for (
size_t j = 0; j < numEnt; ++j)
2637 const size_t numBytes =
2638 packRowForBlockCrs<impl_scalar_type, LO, GO>
2639 (exports.view_host(),
2642 Kokkos::View<const GO*, host_exec>(gblColInds.data(), maxRowLength),
2649 const size_t offsetDiff = offset(i+1) - offset(i);
2650 if (numBytes != offsetDiff) {
2651 std::ostringstream os;
2653 <<
"numBytes computed from packRowForBlockCrs is different from "
2654 <<
"precomputed offset values, LID = " << i << std::endl;
2655 std::cerr << os.str ();
2663 std::ostringstream os;
2664 const bool lclSuccess = ! (* (this->localError_));
2666 << (lclSuccess ?
"succeeded" :
"FAILED")
2668 std::cerr << os.str ();
2672 template<
class Scalar,
class LO,
class GO,
class Node>
2680 Kokkos::DualView<
size_t*,
2685 using ::Tpetra::Details::Behavior;
2686 using ::Tpetra::Details::dualViewStatusToString;
2687 using ::Tpetra::Details::ProfilingRegion;
2688 using ::Tpetra::Details::PackTraits;
2691 typename Kokkos::View<int*, device_type>::HostMirror::execution_space;
2693 ProfilingRegion profile_region(
"Tpetra::BlockCrsMatrix::unpackAndCombine");
2694 const bool verbose = Behavior::verbose ();
2699 std::ostringstream os;
2700 auto map = this->graph_.getRowMap ();
2701 auto comm = map.is_null () ? Teuchos::null : map->getComm ();
2702 const int myRank = comm.is_null () ? -1 : comm->getRank ();
2703 os <<
"Proc " << myRank <<
": Tpetra::BlockCrsMatrix::unpackAndCombine: ";
2706 os <<
"Start" << endl;
2707 std::cerr << os.str ();
2712 if (* (this->localError_)) {
2713 std::ostream& err = this->markLocalErrorAndGetStream ();
2714 std::ostringstream os;
2715 os << prefix <<
"{Im/Ex}port target is already in error." << endl;
2717 std::cerr << os.str ();
2726 if (importLIDs.extent (0) != numPacketsPerLID.extent (0)) {
2727 std::ostream& err = this->markLocalErrorAndGetStream ();
2728 std::ostringstream os;
2729 os << prefix <<
"importLIDs.extent(0) = " << importLIDs.extent (0)
2730 <<
" != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
2733 std::cerr << os.str ();
2739 if (combineMode !=
ADD && combineMode !=
INSERT &&
2741 combineMode !=
ZERO) {
2742 std::ostream& err = this->markLocalErrorAndGetStream ();
2743 std::ostringstream os;
2745 <<
"Invalid CombineMode value " << combineMode <<
". Valid "
2746 <<
"values include ADD, INSERT, REPLACE, ABSMAX, and ZERO."
2749 std::cerr << os.str ();
2755 if (this->graph_.getColMap ().is_null ()) {
2756 std::ostream& err = this->markLocalErrorAndGetStream ();
2757 std::ostringstream os;
2758 os << prefix <<
"Target matrix's column Map is null." << endl;
2760 std::cerr << os.str ();
2769 const map_type& tgtColMap = * (this->graph_.getColMap ());
2772 const size_t blockSize = this->getBlockSize ();
2773 const size_t numImportLIDs = importLIDs.extent(0);
2780 size_t numBytesPerValue(0);
2782 auto val_host = val_.getHostView(Access::ReadOnly);
2784 PackTraits<impl_scalar_type>::packValueCount
2787 const size_t maxRowNumEnt = graph_.getLocalMaxNumRowEntries ();
2788 const size_t maxRowNumScalarEnt = maxRowNumEnt * blockSize * blockSize;
2791 std::ostringstream os;
2792 os << prefix <<
"combineMode: "
2794 <<
", blockSize: " << blockSize
2795 <<
", numImportLIDs: " << numImportLIDs
2796 <<
", numBytesPerValue: " << numBytesPerValue
2797 <<
", maxRowNumEnt: " << maxRowNumEnt
2798 <<
", maxRowNumScalarEnt: " << maxRowNumScalarEnt << endl;
2799 std::cerr << os.str ();
2802 if (combineMode ==
ZERO || numImportLIDs == 0) {
2804 std::ostringstream os;
2805 os << prefix <<
"Nothing to unpack. Done!" << std::endl;
2806 std::cerr << os.str ();
2813 if (imports.need_sync_host ()) {
2814 imports.sync_host ();
2824 if (imports.need_sync_host () || numPacketsPerLID.need_sync_host () ||
2825 importLIDs.need_sync_host ()) {
2826 std::ostream& err = this->markLocalErrorAndGetStream ();
2827 std::ostringstream os;
2828 os << prefix <<
"All input DualViews must be sync'd to host by now. "
2829 <<
"imports_nc.need_sync_host()="
2830 << (imports.need_sync_host () ?
"true" :
"false")
2831 <<
", numPacketsPerLID.need_sync_host()="
2832 << (numPacketsPerLID.need_sync_host () ?
"true" :
"false")
2833 <<
", importLIDs.need_sync_host()="
2834 << (importLIDs.need_sync_host () ?
"true" :
"false")
2837 std::cerr << os.str ();
2843 const auto importLIDsHost = importLIDs.view_host ();
2844 const auto numPacketsPerLIDHost = numPacketsPerLID.view_host ();
2850 Kokkos::View<size_t*, host_exec> offset (
"offset", numImportLIDs+1);
2852 const auto policy = Kokkos::RangePolicy<host_exec>(
size_t(0), numImportLIDs+1);
2853 Kokkos::parallel_scan
2854 (
"Tpetra::BlockCrsMatrix::unpackAndCombine: offsets", policy,
2855 [=] (
const size_t &i,
size_t &update,
const bool &
final) {
2856 if (
final) offset(i) = update;
2857 update += (i == numImportLIDs ? 0 : numPacketsPerLIDHost(i));
2867 Kokkos::View<int, host_exec, Kokkos::MemoryTraits<Kokkos::Atomic> >
2868 errorDuringUnpack (
"errorDuringUnpack");
2869 errorDuringUnpack () = 0;
2871 using policy_type = Kokkos::TeamPolicy<host_exec>;
2872 size_t scratch_per_row =
sizeof(GO) * maxRowNumEnt +
sizeof (LO) * maxRowNumEnt + numBytesPerValue * maxRowNumScalarEnt
2875 const auto policy = policy_type (numImportLIDs, 1, 1)
2876 .set_scratch_size (0, Kokkos::PerTeam (scratch_per_row));
2877 using host_scratch_space =
typename host_exec::scratch_memory_space;
2879 using pair_type = Kokkos::pair<size_t, size_t>;
2880 Kokkos::parallel_for
2881 (
"Tpetra::BlockCrsMatrix::unpackAndCombine: unpack", policy,
2882 [=] (
const typename policy_type::member_type& member) {
2883 const size_t i = member.league_rank();
2884 Kokkos::View<GO*, host_scratch_space> gblColInds
2885 (member.team_scratch (0), maxRowNumEnt);
2886 Kokkos::View<LO*, host_scratch_space> lclColInds
2887 (member.team_scratch (0), maxRowNumEnt);
2888 Kokkos::View<impl_scalar_type*, host_scratch_space> vals
2889 (member.team_scratch (0), maxRowNumScalarEnt);
2892 const size_t offval = offset(i);
2893 const LO lclRow = importLIDsHost(i);
2894 const size_t numBytes = numPacketsPerLIDHost(i);
2895 const size_t numEnt =
2896 unpackRowCount<impl_scalar_type, LO, GO>
2897 (imports.view_host (), offval, numBytes, numBytesPerValue);
2900 if (numEnt > maxRowNumEnt) {
2901 errorDuringUnpack() = 1;
2903 std::ostringstream os;
2905 <<
"At i = " << i <<
", numEnt = " << numEnt
2906 <<
" > maxRowNumEnt = " << maxRowNumEnt
2908 std::cerr << os.str ();
2913 const size_t numScalarEnt = numEnt*blockSize*blockSize;
2914 auto gidsOut = Kokkos::subview(gblColInds, pair_type(0, numEnt));
2915 auto lidsOut = Kokkos::subview(lclColInds, pair_type(0, numEnt));
2916 auto valsOut = Kokkos::subview(vals, pair_type(0, numScalarEnt));
2921 size_t numBytesOut = 0;
2924 unpackRowForBlockCrs<impl_scalar_type, LO, GO>
2925 (Kokkos::View<GO*,host_exec>(gidsOut.data(), numEnt),
2926 Kokkos::View<impl_scalar_type*,host_exec>(valsOut.data(), numScalarEnt),
2927 imports.view_host(),
2928 offval, numBytes, numEnt,
2929 numBytesPerValue, blockSize);
2931 catch (std::exception& e) {
2932 errorDuringUnpack() = 1;
2934 std::ostringstream os;
2935 os << prefix <<
"At i = " << i <<
", unpackRowForBlockCrs threw: "
2936 << e.what () << endl;
2937 std::cerr << os.str ();
2942 if (numBytes != numBytesOut) {
2943 errorDuringUnpack() = 1;
2945 std::ostringstream os;
2947 <<
"At i = " << i <<
", numBytes = " << numBytes
2948 <<
" != numBytesOut = " << numBytesOut <<
"."
2950 std::cerr << os.str();
2956 for (
size_t k = 0; k < numEnt; ++k) {
2958 if (lidsOut(k) == Teuchos::OrdinalTraits<LO>::invalid ()) {
2960 std::ostringstream os;
2962 <<
"At i = " << i <<
", GID " << gidsOut(k)
2963 <<
" is not owned by the calling process."
2965 std::cerr << os.str();
2973 const LO*
const lidsRaw =
const_cast<const LO*
> (lidsOut.data ());
2974 const Scalar*
const valsRaw =
reinterpret_cast<const Scalar*
>
2976 if (combineMode ==
ADD) {
2977 numCombd = this->sumIntoLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2978 }
else if (combineMode ==
INSERT || combineMode ==
REPLACE) {
2979 numCombd = this->replaceLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2980 }
else if (combineMode ==
ABSMAX) {
2981 numCombd = this->absMaxLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2984 if (
static_cast<LO
> (numEnt) != numCombd) {
2985 errorDuringUnpack() = 1;
2987 std::ostringstream os;
2988 os << prefix <<
"At i = " << i <<
", numEnt = " << numEnt
2989 <<
" != numCombd = " << numCombd <<
"."
2991 std::cerr << os.str ();
2998 if (errorDuringUnpack () != 0) {
2999 std::ostream& err = this->markLocalErrorAndGetStream ();
3000 err << prefix <<
"Unpacking failed.";
3002 err <<
" Please run again with the environment variable setting "
3003 "TPETRA_VERBOSE=1 to get more verbose diagnostic output.";
3009 std::ostringstream os;
3010 const bool lclSuccess = ! (* (this->localError_));
3012 << (lclSuccess ?
"succeeded" :
"FAILED")
3014 std::cerr << os.str ();
3018 template<
class Scalar,
class LO,
class GO,
class Node>
3022 using Teuchos::TypeNameTraits;
3023 std::ostringstream os;
3024 os <<
"\"Tpetra::BlockCrsMatrix\": { "
3025 <<
"Template parameters: { "
3026 <<
"Scalar: " << TypeNameTraits<Scalar>::name ()
3027 <<
"LO: " << TypeNameTraits<LO>::name ()
3028 <<
"GO: " << TypeNameTraits<GO>::name ()
3029 <<
"Node: " << TypeNameTraits<Node>::name ()
3031 <<
", Label: \"" << this->getObjectLabel () <<
"\""
3032 <<
", Global dimensions: ["
3033 << graph_.getDomainMap ()->getGlobalNumElements () <<
", "
3034 << graph_.getRangeMap ()->getGlobalNumElements () <<
"]"
3035 <<
", Block size: " << getBlockSize ()
3041 template<
class Scalar,
class LO,
class GO,
class Node>
3044 describe (Teuchos::FancyOStream& out,
3045 const Teuchos::EVerbosityLevel verbLevel)
const
3047 using Teuchos::ArrayRCP;
3048 using Teuchos::CommRequest;
3049 using Teuchos::FancyOStream;
3050 using Teuchos::getFancyOStream;
3051 using Teuchos::ireceive;
3052 using Teuchos::isend;
3053 using Teuchos::outArg;
3054 using Teuchos::TypeNameTraits;
3055 using Teuchos::VERB_DEFAULT;
3056 using Teuchos::VERB_NONE;
3057 using Teuchos::VERB_LOW;
3058 using Teuchos::VERB_MEDIUM;
3060 using Teuchos::VERB_EXTREME;
3062 using Teuchos::wait;
3066 const Teuchos::EVerbosityLevel vl =
3067 (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
3069 if (vl == VERB_NONE) {
3074 Teuchos::OSTab tab0 (out);
3076 out <<
"\"Tpetra::BlockCrsMatrix\":" << endl;
3077 Teuchos::OSTab tab1 (out);
3079 out <<
"Template parameters:" << endl;
3081 Teuchos::OSTab tab2 (out);
3082 out <<
"Scalar: " << TypeNameTraits<Scalar>::name () << endl
3083 <<
"LO: " << TypeNameTraits<LO>::name () << endl
3084 <<
"GO: " << TypeNameTraits<GO>::name () << endl
3085 <<
"Node: " << TypeNameTraits<Node>::name () << endl;
3087 out <<
"Label: \"" << this->getObjectLabel () <<
"\"" << endl
3088 <<
"Global dimensions: ["
3089 << graph_.getDomainMap ()->getGlobalNumElements () <<
", "
3090 << graph_.getRangeMap ()->getGlobalNumElements () <<
"]" << endl;
3092 const LO blockSize = getBlockSize ();
3093 out <<
"Block size: " << blockSize << endl;
3096 if (vl >= VERB_MEDIUM) {
3097 const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
3098 const int myRank = comm.getRank ();
3100 out <<
"Row Map:" << endl;
3102 getRowMap()->describe (out, vl);
3104 if (! getColMap ().is_null ()) {
3105 if (getColMap() == getRowMap()) {
3107 out <<
"Column Map: same as row Map" << endl;
3112 out <<
"Column Map:" << endl;
3114 getColMap ()->describe (out, vl);
3117 if (! getDomainMap ().is_null ()) {
3118 if (getDomainMap () == getRowMap ()) {
3120 out <<
"Domain Map: same as row Map" << endl;
3123 else if (getDomainMap () == getColMap ()) {
3125 out <<
"Domain Map: same as column Map" << endl;
3130 out <<
"Domain Map:" << endl;
3132 getDomainMap ()->describe (out, vl);
3135 if (! getRangeMap ().is_null ()) {
3136 if (getRangeMap () == getDomainMap ()) {
3138 out <<
"Range Map: same as domain Map" << endl;
3141 else if (getRangeMap () == getRowMap ()) {
3143 out <<
"Range Map: same as row Map" << endl;
3148 out <<
"Range Map: " << endl;
3150 getRangeMap ()->describe (out, vl);
3155 if (vl >= VERB_EXTREME) {
3156 const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
3157 const int myRank = comm.getRank ();
3158 const int numProcs = comm.getSize ();
3161 RCP<std::ostringstream> lclOutStrPtr (
new std::ostringstream ());
3162 RCP<FancyOStream> osPtr = getFancyOStream (lclOutStrPtr);
3163 FancyOStream& os = *osPtr;
3164 os <<
"Process " << myRank <<
":" << endl;
3165 Teuchos::OSTab tab2 (os);
3167 const map_type& meshRowMap = * (graph_.getRowMap ());
3168 const map_type& meshColMap = * (graph_.getColMap ());
3173 os <<
"Row " << meshGblRow <<
": {";
3175 local_inds_host_view_type lclColInds;
3176 values_host_view_type vals;
3178 this->getLocalRowView (meshLclRow, lclColInds, vals); numInds = lclColInds.extent(0);
3180 for (LO k = 0; k < numInds; ++k) {
3183 os <<
"Col " << gblCol <<
": [";
3184 for (LO i = 0; i < blockSize; ++i) {
3185 for (LO j = 0; j < blockSize; ++j) {
3186 os << vals[blockSize*blockSize*k + i*blockSize + j];
3187 if (j + 1 < blockSize) {
3191 if (i + 1 < blockSize) {
3196 if (k + 1 < numInds) {
3206 out << lclOutStrPtr->str ();
3207 lclOutStrPtr = Teuchos::null;
3210 const int sizeTag = 1337;
3211 const int dataTag = 1338;
3213 ArrayRCP<char> recvDataBuf;
3217 for (
int p = 1; p < numProcs; ++p) {
3220 ArrayRCP<size_t> recvSize (1);
3222 RCP<CommRequest<int> > recvSizeReq =
3223 ireceive<int, size_t> (recvSize, p, sizeTag, comm);
3224 wait<int> (comm, outArg (recvSizeReq));
3225 const size_t numCharsToRecv = recvSize[0];
3232 if (
static_cast<size_t>(recvDataBuf.size()) < numCharsToRecv + 1) {
3233 recvDataBuf.resize (numCharsToRecv + 1);
3235 ArrayRCP<char> recvData = recvDataBuf.persistingView (0, numCharsToRecv);
3237 RCP<CommRequest<int> > recvDataReq =
3238 ireceive<int, char> (recvData, p, dataTag, comm);
3239 wait<int> (comm, outArg (recvDataReq));
3244 recvDataBuf[numCharsToRecv] =
'\0';
3245 out << recvDataBuf.getRawPtr ();
3247 else if (myRank == p) {
3251 const std::string stringToSend = lclOutStrPtr->str ();
3252 lclOutStrPtr = Teuchos::null;
3255 const size_t numCharsToSend = stringToSend.size ();
3256 ArrayRCP<size_t> sendSize (1);
3257 sendSize[0] = numCharsToSend;
3258 RCP<CommRequest<int> > sendSizeReq =
3259 isend<int, size_t> (sendSize, 0, sizeTag, comm);
3260 wait<int> (comm, outArg (sendSizeReq));
3268 ArrayRCP<const char> sendData (&stringToSend[0], 0, numCharsToSend,
false);
3269 RCP<CommRequest<int> > sendDataReq =
3270 isend<int, char> (sendData, 0, dataTag, comm);
3271 wait<int> (comm, outArg (sendDataReq));
3277 template<
class Scalar,
class LO,
class GO,
class Node>
3278 Teuchos::RCP<const Teuchos::Comm<int> >
3282 return graph_.getComm();
3286 template<
class Scalar,
class LO,
class GO,
class Node>
3291 return graph_.getGlobalNumCols();
3295 template<
class Scalar,
class LO,
class GO,
class Node>
3300 return graph_.getLocalNumCols();
3303 template<
class Scalar,
class LO,
class GO,
class Node>
3308 return graph_.getIndexBase();
3311 template<
class Scalar,
class LO,
class GO,
class Node>
3316 return graph_.getGlobalNumEntries();
3319 template<
class Scalar,
class LO,
class GO,
class Node>
3324 return graph_.getLocalNumEntries();
3327 template<
class Scalar,
class LO,
class GO,
class Node>
3332 return graph_.getNumEntriesInGlobalRow(globalRow);
3336 template<
class Scalar,
class LO,
class GO,
class Node>
3341 return graph_.getGlobalMaxNumRowEntries();
3344 template<
class Scalar,
class LO,
class GO,
class Node>
3349 return graph_.hasColMap();
3353 template<
class Scalar,
class LO,
class GO,
class Node>
3358 return graph_.isLocallyIndexed();
3361 template<
class Scalar,
class LO,
class GO,
class Node>
3366 return graph_.isGloballyIndexed();
3369 template<
class Scalar,
class LO,
class GO,
class Node>
3374 return graph_.isFillComplete ();
3377 template<
class Scalar,
class LO,
class GO,
class Node>
3385 template<
class Scalar,
class LO,
class GO,
class Node>
3389 nonconst_global_inds_host_view_type &,
3390 nonconst_values_host_view_type &,
3393 TEUCHOS_TEST_FOR_EXCEPTION(
3394 true, std::logic_error,
"Tpetra::BlockCrsMatrix::getGlobalRowCopy: "
3395 "This class doesn't support global matrix indexing.");
3399 template<
class Scalar,
class LO,
class GO,
class Node>
3403 global_inds_host_view_type &,
3404 values_host_view_type &)
const
3406 TEUCHOS_TEST_FOR_EXCEPTION(
3407 true, std::logic_error,
"Tpetra::BlockCrsMatrix::getGlobalRowView: "
3408 "This class doesn't support global matrix indexing.");
3412 template<
class Scalar,
class LO,
class GO,
class Node>
3416 local_inds_host_view_type &colInds,
3417 values_host_view_type &vals)
const
3419 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
3420 colInds = local_inds_host_view_type();
3421 vals = values_host_view_type();
3424 const size_t absBlockOffsetStart = ptrHost_[localRowInd];
3425 const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
3426 colInds = local_inds_host_view_type(indHost_.data()+absBlockOffsetStart, numInds);
3428 vals = getValuesHost (localRowInd);
3432 template<
class Scalar,
class LO,
class GO,
class Node>
3436 local_inds_host_view_type &colInds,
3437 nonconst_values_host_view_type &vals)
const
3439 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
3440 colInds = nonconst_local_inds_host_view_type();
3441 vals = nonconst_values_host_view_type();
3444 const size_t absBlockOffsetStart = ptrHost_[localRowInd];
3445 const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
3446 colInds = local_inds_host_view_type(indHost_.data()+absBlockOffsetStart, numInds);
3453 template<
class Scalar,
class LO,
class GO,
class Node>
3458 const size_t lclNumMeshRows = graph_.getLocalNumRows ();
3460 Kokkos::View<size_t*, device_type> diagOffsets (
"diagOffsets", lclNumMeshRows);
3461 graph_.getLocalDiagOffsets (diagOffsets);
3464 auto diagOffsetsHost = Kokkos::create_mirror_view (diagOffsets);
3468 auto vals_host_out = val_.getHostView(Access::ReadOnly);
3472 size_t rowOffset = 0;
3474 LO bs = getBlockSize();
3475 for(
size_t r=0; r<getLocalNumRows(); r++)
3478 offset = rowOffset + diagOffsetsHost(r)*bs*bs;
3479 for(
int b=0; b<bs; b++)
3484 rowOffset += getNumEntriesInLocalRow(r)*bs*bs;
3490 template<
class Scalar,
class LO,
class GO,
class Node>
3493 leftScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& )
3495 TEUCHOS_TEST_FOR_EXCEPTION(
3496 true, std::logic_error,
"Tpetra::BlockCrsMatrix::leftScale: "
3497 "not implemented.");
3501 template<
class Scalar,
class LO,
class GO,
class Node>
3504 rightScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& )
3506 TEUCHOS_TEST_FOR_EXCEPTION(
3507 true, std::logic_error,
"Tpetra::BlockCrsMatrix::rightScale: "
3508 "not implemented.");
3512 template<
class Scalar,
class LO,
class GO,
class Node>
3513 Teuchos::RCP<const ::Tpetra::RowGraph<LO, GO, Node> >
3520 template<
class Scalar,
class LO,
class GO,
class Node>
3521 typename ::Tpetra::RowMatrix<Scalar, LO, GO, Node>::mag_type
3525 TEUCHOS_TEST_FOR_EXCEPTION(
3526 true, std::logic_error,
"Tpetra::BlockCrsMatrix::getFrobeniusNorm: "
3527 "not implemented.");
3537#define TPETRA_BLOCKCRSMATRIX_INSTANT(S,LO,GO,NODE) \
3538 template class BlockCrsMatrix< S, LO, GO, NODE >;
Linear algebra kernels for small dense matrices and vectors.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
LO sumIntoLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Sum into values at the given (mesh, i.e., block) column indices, in the given (mesh,...
void exportAndFillComplete(Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > &destMatrix, const Export< LO, GO, Node > &exporter) const
Import from this to the given destination matrix, and make the result fill complete.
virtual bool isLocallyIndexed() const override
Whether matrix indices are locally indexed.
virtual bool isFillComplete() const override
Whether fillComplete() has been called.
virtual size_t getGlobalMaxNumRowEntries() const override
The maximum number of entries in any row over all processes in the matrix's communicator.
void applyBlock(const BlockMultiVector< Scalar, LO, GO, Node > &X, BlockMultiVector< Scalar, LO, GO, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar beta=Teuchos::ScalarTraits< Scalar >::zero())
Version of apply() that takes BlockMultiVector input and output.
LO replaceLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like replaceLocalValues, but avoids computing row offsets.
virtual void getLocalRowCopy(LO LocalRow, nonconst_local_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Not implemented.
LO absMaxLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
Scalar scalar_type
The type of entries in the matrix (that is, of each entry in each block).
virtual void getGlobalRowView(GO GlobalRow, global_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant, nonpersisting, globally indexed view of the given row of the matrix.
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const override
Print a description of this object to the given output stream.
size_t getLocalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
LO local_ordinal_type
The type of local indices.
virtual void getGlobalRowCopy(GO GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Get a copy of the given global row's entries.
LO getLocalRowOffsets(const LO localRowInd, ptrdiff_t offsets[], const LO colInds[], const LO numColInds) const
Get relative offsets corresponding to the given rows, given by local row index.
virtual size_t getLocalNumCols() const override
The number of columns needed to apply the forward operator on this node.
virtual void copyAndPermute(const SrcDistObject &sourceObj, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM) override
Teuchos::RCP< const map_type > getRangeMap() const override
Get the (point) range Map of this matrix.
LO sumIntoLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValues, but avoids computing row offsets.
virtual bool hasColMap() const override
Whether this matrix has a well-defined column Map.
device_type::execution_space execution_space
The Kokkos execution space that this class uses.
size_t getLocalNumRows() const override
get the local number of block rows
virtual size_t getLocalNumEntries() const override
The local number of stored (structurally nonzero) entries.
impl_scalar_type_dualview::t_host getValuesHostNonConst() const
Get the host or device View of the matrix's values (val_).
Kokkos::View< impl_scalar_type **, Impl::BlockCrsMatrixLittleBlockArrayLayout, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > little_block_type
The type used to access nonconst matrix blocks.
Teuchos::RCP< const map_type > getColMap() const override
get the (mesh) map for the columns of this block matrix.
virtual typename::Tpetra::RowMatrix< Scalar, LO, GO, Node >::mag_type getFrobeniusNorm() const override
The Frobenius norm of the matrix.
virtual Teuchos::RCP< const ::Tpetra::RowGraph< LO, GO, Node > > getGraph() const override
Get the (mesh) graph.
virtual void unpackAndCombine(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &importLIDs, Kokkos::DualView< packet_type *, buffer_device_type > imports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode) override
local_matrix_device_type getLocalMatrixDevice() const
void apply(const mv_type &X, mv_type &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
For this matrix A, compute Y := beta * Y + alpha * Op(A) * X.
virtual global_size_t getGlobalNumCols() const override
The global number of columns of this matrix.
void getLocalDiagOffsets(const Kokkos::View< size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Get offsets of the diagonal entries in the matrix.
Teuchos::RCP< const map_type > getRowMap() const override
get the (mesh) map for the rows of this block matrix.
std::string description() const override
One-line description of this object.
void importAndFillComplete(Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > &destMatrix, const Import< LO, GO, Node > &importer) const
Import from this to the given destination matrix, and make the result fill complete.
void getLocalRowView(LO LocalRow, local_inds_host_view_type &indices, values_host_view_type &values) const override
Get a view of the (mesh, i.e., block) row, using local (mesh, i.e., block) indices.
void getLocalDiagCopy(const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &diag, const Kokkos::View< const size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
size_t getNumEntriesInLocalRow(const LO localRowInd) const override
Return the number of entries in the given row on the calling process.
virtual bool supportsRowViews() const override
Whether this object implements getLocalRowView() and getGlobalRowView().
LO replaceLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Replace values at the given (mesh, i.e., block) column indices, in the given (mesh,...
virtual size_t getNumEntriesInGlobalRow(GO globalRow) const override
The current number of entries on the calling process in the specified global row.
Node::device_type device_type
The Kokkos::Device specialization that this class uses.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
Kokkos::View< const impl_scalar_type **, Impl::BlockCrsMatrixLittleBlockArrayLayout, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > const_little_block_type
The type used to access const matrix blocks.
void getLocalRowViewNonConst(LO LocalRow, local_inds_host_view_type &indices, nonconst_values_host_view_type &values) const
char packet_type
Implementation detail; tells.
typename DistObject< Scalar, LO, GO, Node >::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
virtual void packAndPrepare(const SrcDistObject &sourceObj, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< packet_type *, buffer_device_type > &exports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, size_t &constantNumPackets) override
virtual void rightScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the right with the given Vector x.
virtual GO getIndexBase() const override
The index base for global indices in this matrix.
virtual global_size_t getGlobalNumEntries() const override
The global number of stored (structurally nonzero) entries.
typename BMV::impl_scalar_type impl_scalar_type
The implementation type of entries in the matrix.
virtual Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which this matrix is distributed.
global_size_t getGlobalNumRows() const override
get the global number of block rows
virtual void leftScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the left with the given Vector x.
Teuchos::RCP< const map_type > getDomainMap() const override
Get the (point) domain Map of this matrix.
virtual bool isGloballyIndexed() const override
Whether matrix indices are globally indexed.
BlockCrsMatrix()
Default constructor: Makes an empty block matrix.
MultiVector for multiple degrees of freedom per mesh point.
static map_type makePointMap(const map_type &meshMap, const LO blockSize)
Create and return the point Map corresponding to the given mesh Map and block size.
LO getBlockSize() const
Get the number of degrees of freedom per mesh point.
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Get the number of entries in the given row (local index).
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the domain of this graph.
Kokkos::StaticCrsGraph< local_ordinal_type, Kokkos::LayoutLeft, device_type, void, size_t > local_graph_device_type
The type of the part of the sparse graph on each MPI process.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
global_ordinal_type getGlobalElement(local_ordinal_type localIndex) const
The global index corresponding to the given local index.
bool isNodeLocalElement(local_ordinal_type localIndex) const
Whether the given local index is valid for this Map on the calling process.
bool locallySameAs(const Map< local_ordinal_type, global_ordinal_type, node_type > &map) const
Is this Map locally the same as the input Map?
local_ordinal_type getLocalElement(global_ordinal_type globalIndex) const
The local index corresponding to the given global index.
local_ordinal_type getMinLocalIndex() const
The minimum local index.
local_ordinal_type getMaxLocalIndex() const
The maximum local index on the calling process.
One or more distributed dense vectors.
A distributed dense vector.
void replaceLocalValue(const LocalOrdinal myRow, const Scalar &value)
Replace current value at the specified location with specified values.
int local_ordinal_type
Default value of Scalar template parameter.
Kokkos::LayoutRight BlockCrsMatrixLittleBlockArrayLayout
give an option to use layoutleft
KOKKOS_INLINE_FUNCTION void absMax(const ViewType2 &Y, const ViewType1 &X)
Implementation of Tpetra's ABSMAX CombineMode for the small dense blocks in BlockCrsMatrix,...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
KOKKOS_INLINE_FUNCTION void GEMV(const CoeffType &alpha, const BlkType &A, const VecType1 &x, const VecType2 &y)
y := y + alpha * A * x (dense matrix-vector multiply)
KOKKOS_INLINE_FUNCTION void AXPY(const CoefficientType &alpha, const ViewType1 &x, const ViewType2 &y)
y := y + alpha * x (dense vector or matrix update)
KOKKOS_INLINE_FUNCTION void FILL(const ViewType &x, const InputType &val)
Set every entry of x to val.
size_t global_size_t
Global size_t object.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
KOKKOS_INLINE_FUNCTION void SCAL(const CoefficientType &alpha, const ViewType &x)
x := alpha*x, where x is either rank 1 (a vector) or rank 2 (a matrix).
KOKKOS_INLINE_FUNCTION void COPY(const ViewType1 &x, const ViewType2 &y)
Deep copy x into y, where x and y are either rank 1 (vectors) or rank 2 (matrices) with the same dime...
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ INSERT
Insert new values that don't currently exist.
@ ZERO
Replace old values with zero.