46#ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
47#define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
49#include "Kokkos_UnorderedMap.hpp"
50#include "Xpetra_CrsGraphFactory.hpp"
54#include "MueLu_Aggregates_kokkos.hpp"
55#include "MueLu_AmalgamationFactory_kokkos.hpp"
56#include "MueLu_AmalgamationInfo_kokkos.hpp"
57#include "MueLu_CoarseMapFactory_kokkos.hpp"
60#include "MueLu_NullspaceFactory_kokkos.hpp"
61#include "MueLu_PerfUtils.hpp"
63#include "MueLu_Utilities_kokkos.hpp"
65#include "Xpetra_IO.hpp"
71 template<
class LocalOrdinal,
class View>
72 class ReduceMaxFunctor{
74 ReduceMaxFunctor(View view) :
view_(view) { }
76 KOKKOS_INLINE_FUNCTION
82 KOKKOS_INLINE_FUNCTION
89 KOKKOS_INLINE_FUNCTION
98 template<
class LOType,
class GOType,
class SCType,
class DeviceType,
class NspType,
class aggRowsType,
class maxAggDofSizeType,
class agg2RowMapLOType,
class statusType,
class rowsType,
class rowsAuxType,
class colsAuxType,
class valsAuxType>
99 class LocalQRDecompFunctor {
105 typedef typename DeviceType::execution_space execution_space;
106 typedef typename Kokkos::ArithTraits<SC>::val_type impl_SC;
107 typedef Kokkos::ArithTraits<impl_SC> impl_ATS;
108 typedef typename impl_ATS::magnitudeType Magnitude;
110 typedef Kokkos::View<impl_SC**,typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_matrix;
111 typedef Kokkos::View<impl_SC* ,typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_vector;
127 LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_,
bool doQRStep_) :
141 KOKKOS_INLINE_FUNCTION
142 void operator() (
const typename Kokkos::TeamPolicy<execution_space>::member_type & thread,
size_t& nnz)
const {
143 auto agg = thread.league_rank();
148 const impl_SC one = impl_ATS::one();
149 const impl_SC two = one + one;
150 const impl_SC zero = impl_ATS::zero();
151 const auto zeroM = impl_ATS::magnitude(zero);
157 Xpetra::global_size_t offset = agg * n;
162 shared_matrix r(thread.team_shmem(), m, n);
163 for (
int j = 0; j < n; j++)
164 for (
int k = 0; k < m; k++)
168 for (
int i = 0; i < m; i++) {
169 for (
int j = 0; j < n; j++)
170 printf(
" %5.3lf ", r(i,j));
176 shared_matrix q(thread.team_shmem(), m, m);
178 bool isSingular =
false;
182 for (
int i = 0; i < m; i++) {
183 for (
int j = 0; j < m; j++)
188 for (
int k = 0; k < n; k++) {
190 Magnitude s = zeroM, norm, norm_x;
191 for (
int i = k+1; i < m; i++)
192 s += pow(impl_ATS::magnitude(r(i,k)), 2);
193 norm = sqrt(pow(impl_ATS::magnitude(r(k,k)), 2) + s);
202 norm_x = sqrt(pow(impl_ATS::magnitude(r(k,k)), 2) + s);
203 if (norm_x == zeroM) {
211 for (
int i = k; i < m; i++)
215 for (
int j = k+1; j < n; j++) {
218 for (
int i = k; i < m; i++)
219 si += r(i,k) * r(i,j);
220 for (
int i = k; i < m; i++)
221 r(i,j) -= two*si * r(i,k);
225 for (
int j = k; j < m; j++) {
228 for (
int i = k; i < m; i++)
229 si += r(i,k) * qt(i,j);
230 for (
int i = k; i < m; i++)
231 qt(i,j) -= two*si * r(i,k);
236 for (
int i = k+1; i < m; i++)
242 for (
int i = 0; i < m; i++)
243 for (
int j = 0; j < i; j++) {
244 impl_SC tmp = qt(i,j);
251 for (
int j = 0; j < n; j++)
252 for (
int k = 0; k <= j; k++)
291 for (
int j = 0; j < n; j++)
292 for (
int k = 0; k < n; k++)
296 coarseNS(offset+k,j) = (k == j ? one : zero);
299 for (
int i = 0; i < m; i++)
300 for (
int j = 0; j < n; j++)
301 q(i,j) = (j == i ? one : zero);
305 for (
int j = 0; j < m; j++) {
307 size_t rowStart =
rowsAux(localRow);
309 for (
int k = 0; k < n; k++) {
311 if (q(j,k) != zero) {
312 colsAux(rowStart+lnnz) = offset + k;
313 valsAux(rowStart+lnnz) = q(j,k);
317 rows(localRow+1) = lnnz;
323 for (
int i = 0; i < m; i++) {
324 for (
int j = 0; j < n; j++)
330 for (
int i = 0; i < aggSize; i++) {
331 for (
int j = 0; j < aggSize; j++)
332 printf(
" %5.3lf ", q(i,j));
346 for (
int j = 0; j < m; j++) {
348 size_t rowStart =
rowsAux(localRow);
350 for (
int k = 0; k < n; k++) {
351 const impl_SC qr_jk =
fineNS(localRow,k);
354 colsAux(rowStart+lnnz) = offset + k;
355 valsAux(rowStart+lnnz) = qr_jk;
359 rows(localRow+1) = lnnz;
363 for (
int j = 0; j < n; j++)
371 size_t team_shmem_size(
int )
const {
375 return shared_matrix::shmem_size(m, n) +
376 shared_matrix::shmem_size(m, m);
384 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
386 RCP<ParameterList> validParamList = rcp(
new ParameterList());
388#define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
391#undef SET_VALID_ENTRY
393 validParamList->set< RCP<const FactoryBase> >(
"A", Teuchos::null,
"Generating factory of the matrix A");
394 validParamList->set< RCP<const FactoryBase> >(
"Aggregates", Teuchos::null,
"Generating factory of the aggregates");
395 validParamList->set< RCP<const FactoryBase> >(
"Nullspace", Teuchos::null,
"Generating factory of the nullspace");
396 validParamList->set< RCP<const FactoryBase> >(
"Scaled Nullspace", Teuchos::null,
"Generating factory of the scaled nullspace");
397 validParamList->set< RCP<const FactoryBase> >(
"UnAmalgamationInfo", Teuchos::null,
"Generating factory of UnAmalgamationInfo");
398 validParamList->set< RCP<const FactoryBase> >(
"CoarseMap", Teuchos::null,
"Generating factory of the coarse map");
399 validParamList->set< RCP<const FactoryBase> >(
"Coordinates", Teuchos::null,
"Generating factory of the coordinates");
402 ParameterList norecurse;
403 norecurse.disableRecursiveValidation();
404 validParamList->set<ParameterList> (
"matrixmatrix: kernel params", norecurse,
"MatrixMatrix kernel parameters");
406 return validParamList;
409 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
412 const ParameterList& pL = GetParameterList();
414 std::string nspName =
"Nullspace";
415 if(pL.isParameter(
"Nullspace name")) nspName = pL.get<std::string>(
"Nullspace name");
417 Input(fineLevel,
"A");
418 Input(fineLevel,
"Aggregates");
419 Input(fineLevel, nspName);
420 Input(fineLevel,
"UnAmalgamationInfo");
421 Input(fineLevel,
"CoarseMap");
424 pL.get<
bool>(
"tentative: build coarse coordinates") ) {
425 bTransferCoordinates_ =
true;
426 Input(fineLevel,
"Coordinates");
427 }
else if (bTransferCoordinates_) {
428 Input(fineLevel,
"Coordinates");
432 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
434 return BuildP(fineLevel, coarseLevel);
437 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
441 typedef typename Teuchos::ScalarTraits<Scalar>::coordinateType coordinate_type;
442 typedef Xpetra::MultiVectorFactory<coordinate_type,LO,GO,NO> RealValuedMultiVectorFactory;
443 const ParameterList& pL = GetParameterList();
444 std::string nspName =
"Nullspace";
445 if(pL.isParameter(
"Nullspace name")) nspName = pL.get<std::string>(
"Nullspace name");
447 auto A = Get< RCP<Matrix> > (fineLevel,
"A");
448 auto aggregates = Get< RCP<Aggregates_kokkos> > (fineLevel,
"Aggregates");
449 auto amalgInfo = Get< RCP<AmalgamationInfo_kokkos> > (fineLevel,
"UnAmalgamationInfo");
450 auto fineNullspace = Get< RCP<MultiVector> > (fineLevel, nspName);
451 auto coarseMap = Get< RCP<const Map> > (fineLevel,
"CoarseMap");
452 RCP<RealValuedMultiVector> fineCoords;
453 if(bTransferCoordinates_) {
454 fineCoords = Get< RCP<RealValuedMultiVector> >(fineLevel,
"Coordinates");
457 RCP<Matrix> Ptentative;
460 if ( aggregates->GetNumGlobalAggregatesComputeIfNeeded() == 0) {
461 Ptentative = Teuchos::null;
462 Set(coarseLevel,
"P", Ptentative);
465 RCP<MultiVector> coarseNullspace;
466 RCP<RealValuedMultiVector> coarseCoords;
468 if(bTransferCoordinates_) {
469 ArrayView<const GO> elementAList = coarseMap->getLocalElementList();
470 GO indexBase = coarseMap->getIndexBase();
473 if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null)
474 blkSize = rcp_dynamic_cast<const StridedMap>(coarseMap)->getFixedBlockSize();
476 Array<GO> elementList;
477 ArrayView<const GO> elementListView;
481 elementListView = elementAList;
484 auto numElements = elementAList.size() / blkSize;
486 elementList.resize(numElements);
489 for (LO i = 0; i < Teuchos::as<LO>(numElements); i++)
490 elementList[i] = (elementAList[i*blkSize]-indexBase)/blkSize + indexBase;
492 elementListView = elementList;
495 auto uniqueMap = fineCoords->getMap();
496 auto coarseCoordMap = MapFactory::Build(coarseMap->lib(), Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
497 elementListView, indexBase, coarseMap->getComm());
498 coarseCoords = RealValuedMultiVectorFactory::Build(coarseCoordMap, fineCoords->getNumVectors());
501 RCP<RealValuedMultiVector> ghostedCoords = fineCoords;
502 if (aggregates->AggregatesCrossProcessors()) {
503 auto nonUniqueMap = aggregates->GetMap();
504 auto importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
506 ghostedCoords = RealValuedMultiVectorFactory::Build(nonUniqueMap, fineCoords->getNumVectors());
507 ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);
512 auto aggGraph = aggregates->GetGraph();
513 auto numAggs = aggGraph.numRows();
515 auto fineCoordsView = fineCoords ->getDeviceLocalView(Xpetra::Access::ReadOnly);
516 auto coarseCoordsView = coarseCoords->getDeviceLocalView(Xpetra::Access::OverwriteAll);
522 const auto dim = fineCoords->getNumVectors();
524 typename AppendTrait<
decltype(fineCoordsView), Kokkos::RandomAccess>::type fineCoordsRandomView = fineCoordsView;
525 for (
size_t j = 0; j < dim; j++) {
526 Kokkos::parallel_for(
"MueLu::TentativeP::BuildCoords", Kokkos::RangePolicy<local_ordinal_type, execution_space>(0, numAggs),
527 KOKKOS_LAMBDA(
const LO i) {
531 auto aggregate = aggGraph.rowConst(i);
533 coordinate_type sum = 0.0;
534 for (
size_t colID = 0; colID < static_cast<size_t>(aggregate.length); colID++)
535 sum += fineCoordsRandomView(aggregate(colID),j);
537 coarseCoordsView(i,j) = sum / aggregate.length;
543 if (!aggregates->AggregatesCrossProcessors()) {
544 if(Xpetra::Helpers<SC,LO,GO,NO>::isTpetraBlockCrs(A)) {
545 BuildPuncoupledBlockCrs(coarseLevel,A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace,
549 BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.
GetLevelID());
553 BuildPcoupled (A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
563 if (A->IsView(
"stridedMaps") ==
true)
564 Ptentative->CreateView(
"stridedMaps", A->getRowMap(
"stridedMaps"), coarseMap);
566 Ptentative->CreateView(
"stridedMaps", Ptentative->getRangeMap(), coarseMap);
568 if(bTransferCoordinates_) {
569 Set(coarseLevel,
"Coordinates", coarseCoords);
571 Set(coarseLevel,
"Nullspace", coarseNullspace);
572 Set(coarseLevel,
"P", Ptentative);
575 RCP<ParameterList> params = rcp(
new ParameterList());
576 params->set(
"printLoadBalancingInfo",
true);
581 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
583 BuildPuncoupled(
Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates,
584 RCP<AmalgamationInfo_kokkos> amalgInfo, RCP<MultiVector> fineNullspace,
585 RCP<const Map> coarseMap, RCP<Matrix>& Ptentative,
586 RCP<MultiVector>& coarseNullspace,
const int levelID)
const {
587 auto rowMap = A->getRowMap();
588 auto colMap = A->getColMap();
590 const size_t numRows = rowMap->getLocalNumElements();
591 const size_t NSDim = fineNullspace->getNumVectors();
593 typedef Kokkos::ArithTraits<SC> ATS;
594 using impl_SC =
typename ATS::val_type;
595 using impl_ATS = Kokkos::ArithTraits<impl_SC>;
596 const impl_SC zero = impl_ATS::zero(), one = impl_ATS::one();
598 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
600 typename Aggregates_kokkos::local_graph_type aggGraph;
603 aggGraph = aggregates->GetGraph();
605 auto aggRows = aggGraph.row_map;
606 auto aggCols = aggGraph.entries;
614 goodMap = isGoodMap(*rowMap, *colMap);
618 "MueLu: TentativePFactory_kokkos: for now works only with good maps "
619 "(i.e. \"matching\" row and column maps)");
628 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
630 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
631 GO globalOffset = amalgInfo->GlobalOffset();
634 auto procWinner = aggregates->GetProcWinner() ->getDeviceLocalView(Xpetra::Access::ReadOnly);
635 auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
636 const size_t numAggregates = aggregates->GetNumAggregates();
638 int myPID = aggregates->GetMap()->getComm()->getRank();
643 typedef typename Aggregates_kokkos::aggregates_sizes_type::non_const_type AggSizeType;
644 AggSizeType aggDofSizes;
646 if (stridedBlockSize == 1) {
650 aggDofSizes = AggSizeType(
"agg_dof_sizes", numAggregates+1);
652 auto sizesConst = aggregates->ComputeAggregateSizes();
653 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(
static_cast<size_t>(1), numAggregates+1)), sizesConst);
659 aggDofSizes = AggSizeType(
"agg_dof_sizes", numAggregates + 1);
661 auto nodeMap = aggregates->GetMap()->getLocalMap();
662 auto dofMap = colMap->getLocalMap();
664 Kokkos::parallel_for(
"MueLu:TentativePF:Build:compute_agg_sizes",
range_type(0,numAggregates),
665 KOKKOS_LAMBDA(
const LO agg) {
666 auto aggRowView = aggGraph.rowConst(agg);
669 for (LO colID = 0; colID < aggRowView.length; colID++) {
670 GO nodeGID = nodeMap.getGlobalElement(aggRowView(colID));
672 for (LO k = 0; k < stridedBlockSize; k++) {
673 GO dofGID = (nodeGID - indexBase) * fullBlockSize + k + indexBase + globalOffset + stridingOffset;
675 if (dofMap.getLocalElement(dofGID) != INVALID)
679 aggDofSizes(agg+1) = size;
686 ReduceMaxFunctor<LO,
decltype(aggDofSizes)> reduceMax(aggDofSizes);
687 Kokkos::parallel_reduce(
"MueLu:TentativePF:Build:max_agg_size",
range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
691 Kokkos::parallel_scan(
"MueLu:TentativePF:Build:aggregate_sizes:stage1_scan",
range_type(0,numAggregates+1),
692 KOKKOS_LAMBDA(
const LO i, LO& update,
const bool& final_pass) {
693 update += aggDofSizes(i);
695 aggDofSizes(i) = update;
700 Kokkos::View<LO*, DeviceType>
agg2RowMapLO(Kokkos::ViewAllocateWithoutInitializing(
"agg2row_map_LO"), numRows);
704 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing(
"aggOffsets"), numAggregates);
705 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(
static_cast<size_t>(0), numAggregates)));
707 Kokkos::parallel_for(
"MueLu:TentativePF:Build:createAgg2RowMap",
range_type(0, vertex2AggId.extent(0)),
708 KOKKOS_LAMBDA(
const LO lnode) {
709 if (procWinner(lnode, 0) == myPID) {
711 auto aggID = vertex2AggId(lnode,0);
713 auto offset = Kokkos::atomic_fetch_add( &aggOffsets(aggID), stridedBlockSize );
717 for (LO k = 0; k < stridedBlockSize; k++)
725 coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
728 auto fineNS = fineNullspace ->getDeviceLocalView(Xpetra::Access::ReadWrite);
729 auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
733 typedef typename Xpetra::Matrix<SC,LO,GO,NO>::local_matrix_type local_matrix_type;
734 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
735 typedef typename local_matrix_type::index_type::non_const_type cols_type;
736 typedef typename local_matrix_type::values_type::non_const_type vals_type;
740 typedef Kokkos::View<int[10], DeviceType> status_type;
741 status_type status(
"status");
746 const ParameterList& pL = GetParameterList();
747 const bool&
doQRStep = pL.get<
bool>(
"tentative: calculate qr");
749 GetOStream(
Runtime1) <<
"TentativePFactory : bypassing local QR phase" << std::endl;
751 GetOStream(
Warnings0) <<
"TentativePFactor : for nontrivial nullspace, this may degrade performance" << std::endl;
754 size_t nnzEstimate = numRows * NSDim;
755 rows_type
rowsAux(Kokkos::ViewAllocateWithoutInitializing(
"Ptent_aux_rows"), numRows+1);
756 cols_type
colsAux(Kokkos::ViewAllocateWithoutInitializing(
"Ptent_aux_cols"), nnzEstimate);
757 vals_type
valsAux(
"Ptent_aux_vals", nnzEstimate);
758 rows_type
rows(
"Ptent_rows", numRows+1);
765 Kokkos::parallel_for(
"MueLu:TentativePF:BuildPuncoupled:for1",
range_type(0, numRows+1),
766 KOKKOS_LAMBDA(
const LO row) {
769 Kokkos::parallel_for(
"MueLu:TentativePF:BuildUncoupled:for2",
range_type(0, nnzEstimate),
770 KOKKOS_LAMBDA(
const LO j) {
787 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
790 Kokkos::parallel_for(
"MueLu:TentativePF:BuildUncoupled:main_loop", policy,
791 KOKKOS_LAMBDA(
const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
792 auto agg = thread.league_rank();
800 auto norm = impl_ATS::magnitude(zero);
805 for (
decltype(aggSize) k = 0; k < aggSize; k++) {
821 for (
decltype(aggSize) k = 0; k < aggSize; k++) {
825 rows(localRow+1) = 1;
832 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
833 Kokkos::deep_copy(statusHost, status);
834 for (
decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
836 std::ostringstream oss;
837 oss <<
"MueLu::TentativePFactory::MakeTentative: ";
839 case 0: oss <<
"!goodMap is not implemented";
break;
840 case 1: oss <<
"fine level NS part has a zero column";
break;
846 Kokkos::parallel_for(
"MueLu:TentativePF:BuildUncoupled:main_loop_noqr", policy,
847 KOKKOS_LAMBDA(
const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
848 auto agg = thread.league_rank();
857 for (
decltype(aggSize) k = 0; k < aggSize; k++) {
861 rows(localRow+1) = 1;
869 Kokkos::parallel_reduce(
"MueLu:TentativeP:CountNNZ",
range_type(0, numRows+1),
870 KOKKOS_LAMBDA(
const LO i,
size_t &nnz_count) {
871 nnz_count +=
rows(i);
888 const Kokkos::TeamPolicy<execution_space> policy(numAggregates,1);
890 decltype(aggDofSizes ),
decltype(maxAggSize),
decltype(
agg2RowMapLO),
895 Kokkos::parallel_reduce(
"MueLu:TentativePF:BuildUncoupled:main_qr_loop", policy, localQRFunctor, nnz);
898 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
899 Kokkos::deep_copy(statusHost, status);
900 for (
decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
902 std::ostringstream oss;
903 oss <<
"MueLu::TentativePFactory::MakeTentative: ";
905 case 0: oss <<
"!goodMap is not implemented";
break;
906 case 1: oss <<
"fine level NS part has a zero column";
break;
919 if (nnz != nnzEstimate) {
924 Kokkos::parallel_scan(
"MueLu:TentativePF:Build:compress_rows",
range_type(0,numRows+1),
925 KOKKOS_LAMBDA(
const LO i, LO& upd,
const bool&
final) {
935 cols = cols_type(
"Ptent_cols", nnz);
936 vals = vals_type(
"Ptent_vals", nnz);
941 Kokkos::parallel_for(
"MueLu:TentativePF:Build:compress_cols_vals",
range_type(0,numRows),
942 KOKKOS_LAMBDA(
const LO i) {
943 LO rowStart =
rows(i);
948 cols(rowStart+lnnz) =
colsAux(j);
949 vals(rowStart+lnnz) =
valsAux(j);
961 GetOStream(
Runtime1) <<
"TentativePFactory : aggregates do not cross process boundaries" << std::endl;
967 local_matrix_type lclMatrix = local_matrix_type(
"A", numRows, coarseMap->getLocalNumElements(), nnz, vals,
rows, cols);
970 RCP<ParameterList> FCparams;
971 if (pL.isSublist(
"matrixmatrix: kernel params"))
972 FCparams = rcp(
new ParameterList(pL.sublist(
"matrixmatrix: kernel params")));
974 FCparams = rcp(
new ParameterList);
977 FCparams->set(
"compute global constants", FCparams->get(
"compute global constants",
false));
978 FCparams->set(
"Timer Label", std::string(
"MueLu::TentativeP-") +
toString(levelID));
980 auto PtentCrs = CrsMatrixFactory::Build(lclMatrix, rowMap, coarseMap, coarseMap, A->getDomainMap());
981 Ptentative = rcp(
new CrsMatrixWrap(PtentCrs));
986 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
988 BuildPuncoupledBlockCrs(
Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates,
989 RCP<AmalgamationInfo_kokkos> amalgInfo, RCP<MultiVector> fineNullspace,
990 RCP<const Map> coarsePointMap, RCP<Matrix>& Ptentative,
991 RCP<MultiVector>& coarseNullspace,
const int levelID)
const {
992#ifdef HAVE_MUELU_TPETRA
1002 RCP<const Map> rowMap = A->getRowMap();
1003 RCP<const Map> rangeMap = A->getRangeMap();
1004 RCP<const Map> colMap = A->getColMap();
1006 const size_t numFineBlockRows = rowMap->getLocalNumElements();
1010 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1012 typedef Kokkos::ArithTraits<SC> ATS;
1013 using impl_SC =
typename ATS::val_type;
1014 using impl_ATS = Kokkos::ArithTraits<impl_SC>;
1015 const impl_SC one = impl_ATS::one();
1018 const size_t NSDim = fineNullspace->getNumVectors();
1019 auto aggSizes = aggregates->ComputeAggregateSizes();
1022 typename Aggregates_kokkos::local_graph_type aggGraph;
1025 aggGraph = aggregates->GetGraph();
1027 auto aggRows = aggGraph.row_map;
1028 auto aggCols = aggGraph.entries;
1035 const size_t numCoarseBlockRows = coarsePointMap->getLocalNumElements() / NSDim;
1036 RCP<const Map> coarseBlockMap = MapFactory::Build(coarsePointMap->lib(),
1037 Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
1039 coarsePointMap->getIndexBase(),
1040 coarsePointMap->getComm());
1042 const ParameterList& pL = GetParameterList();
1053 "MueLu: TentativePFactory_kokkos: for now works only with good maps "
1054 "(i.e. \"matching\" row and column maps)");
1063 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
1065 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
1069 auto procWinner = aggregates->GetProcWinner() ->getDeviceLocalView(Xpetra::Access::ReadOnly);
1070 auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
1071 const size_t numAggregates = aggregates->GetNumAggregates();
1073 int myPID = aggregates->GetMap()->getComm()->getRank();
1078 typedef typename Aggregates_kokkos::aggregates_sizes_type::non_const_type AggSizeType;
1079 AggSizeType aggDofSizes;
1085 aggDofSizes = AggSizeType(
"agg_dof_sizes", numAggregates+1);
1087 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(
static_cast<size_t>(1), numAggregates+1)), aggSizes);
1093 ReduceMaxFunctor<LO,
decltype(aggDofSizes)> reduceMax(aggDofSizes);
1094 Kokkos::parallel_reduce(
"MueLu:TentativePF:Build:max_agg_size",
range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
1098 Kokkos::parallel_scan(
"MueLu:TentativePF:Build:aggregate_sizes:stage1_scan",
range_type(0,numAggregates+1),
1099 KOKKOS_LAMBDA(
const LO i, LO& update,
const bool& final_pass) {
1100 update += aggDofSizes(i);
1102 aggDofSizes(i) = update;
1107 Kokkos::View<LO*, DeviceType> aggToRowMapLO(Kokkos::ViewAllocateWithoutInitializing(
"aggtorow_map_LO"), numFineBlockRows);
1111 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing(
"aggOffsets"), numAggregates);
1112 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(
static_cast<size_t>(0), numAggregates)));
1114 Kokkos::parallel_for(
"MueLu:TentativePF:Build:createAgg2RowMap",
range_type(0, vertex2AggId.extent(0)),
1115 KOKKOS_LAMBDA(
const LO lnode) {
1116 if (procWinner(lnode, 0) == myPID) {
1118 auto aggID = vertex2AggId(lnode,0);
1120 auto offset = Kokkos::atomic_fetch_add( &aggOffsets(aggID), stridedBlockSize );
1124 for (LO k = 0; k < stridedBlockSize; k++)
1125 aggToRowMapLO(offset + k) = lnode*stridedBlockSize + k;
1132 coarseNullspace = MultiVectorFactory::Build(coarsePointMap, NSDim);
1135 auto fineNS = fineNullspace ->getDeviceLocalView(Xpetra::Access::ReadWrite);
1136 auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
1138 typedef typename Xpetra::Matrix<SC,LO,GO,NO>::local_matrix_type local_matrix_type;
1139 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
1140 typedef typename local_matrix_type::index_type::non_const_type cols_type;
1145 typedef Kokkos::View<int[10], DeviceType> status_type;
1146 status_type status(
"status");
1152 GetOStream(
Runtime1) <<
"TentativePFactory : bypassing local QR phase" << std::endl;
1158 rows_type ia(Kokkos::ViewAllocateWithoutInitializing(
"BlockGraph_rowptr"), numFineBlockRows+1);
1159 cols_type ja(Kokkos::ViewAllocateWithoutInitializing(
"BlockGraph_colind"), numFineBlockRows);
1161 Kokkos::parallel_for(
"MueLu:TentativePF:BlockCrs:graph_init",
range_type(0, numFineBlockRows),
1162 KOKKOS_LAMBDA(
const LO j) {
1166 if(j==(LO)numFineBlockRows-1)
1167 ia[numFineBlockRows] = numFineBlockRows;
1171 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
1172 Kokkos::parallel_for(
"MueLu:TentativePF:BlockCrs:fillGraph", policy,
1173 KOKKOS_LAMBDA(
const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
1174 auto agg = thread.league_rank();
1175 Xpetra::global_size_t offset = agg;
1180 for (LO j = 0; j < aggSize; j++) {
1182 const LO localRow = aggToRowMapLO[aggDofSizes[agg]+j];
1183 const size_t rowStart = ia[localRow];
1184 ja[rowStart] = offset;
1194 rows_type i_temp(Kokkos::ViewAllocateWithoutInitializing(
"BlockGraph_rowptr"), numFineBlockRows+1);
1196 Kokkos::parallel_scan(
"MueLu:TentativePF:BlockCrs:compress_rows",
range_type(0,numFineBlockRows),
1197 KOKKOS_LAMBDA(
const LO i, LO& upd,
const bool&
final) {
1200 for (
auto j = ia[i]; j < ia[i+1]; j++)
1201 if (ja[j] != INVALID)
1203 if(
final && i == (LO) numFineBlockRows-1)
1204 i_temp[numFineBlockRows] = upd;
1207 cols_type j_temp(Kokkos::ViewAllocateWithoutInitializing(
"BlockGraph_colind"), nnz);
1210 Kokkos::parallel_for(
"MueLu:TentativePF:BlockCrs:compress_cols",
range_type(0,numFineBlockRows),
1211 KOKKOS_LAMBDA(
const LO i) {
1212 size_t rowStart = i_temp[i];
1214 for (
auto j = ia[i]; j < ia[i+1]; j++)
1215 if (ja[j] != INVALID) {
1216 j_temp[rowStart+lnnz] = ja[j];
1225 RCP<CrsGraph> BlockGraph = CrsGraphFactory::Build(rowMap,coarseBlockMap,ia,ja);
1230 RCP<ParameterList> FCparams;
1231 if(pL.isSublist(
"matrixmatrix: kernel params"))
1232 FCparams=rcp(
new ParameterList(pL.sublist(
"matrixmatrix: kernel params")));
1234 FCparams= rcp(
new ParameterList);
1236 FCparams->set(
"compute global constants",FCparams->get(
"compute global constants",
false));
1237 std::string levelIDs =
toString(levelID);
1238 FCparams->set(
"Timer Label",std::string(
"MueLu::TentativeP-")+levelIDs);
1239 RCP<const Export> dummy_e;
1240 RCP<const Import> dummy_i;
1241 BlockGraph->expertStaticFillComplete(coarseBlockMap,rowMap,dummy_i,dummy_e,FCparams);
1252 RCP<Xpetra::CrsMatrix<SC,LO,GO,NO> > P_xpetra = Xpetra::CrsMatrixFactory<SC,LO,GO,NO>::BuildBlock(BlockGraph, coarsePointMap, rangeMap,NSDim);
1253 RCP<Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO> > P_tpetra = rcp_dynamic_cast<Xpetra::TpetraBlockCrsMatrix<SC,LO,GO,NO> >(P_xpetra);
1254 if(P_tpetra.is_null())
throw std::runtime_error(
"BuildPUncoupled: Matrix factory did not return a Tpetra::BlockCrsMatrix");
1255 RCP<CrsMatrixWrap> P_wrap = rcp(
new CrsMatrixWrap(P_xpetra));
1257 auto values = P_tpetra->getTpetra_BlockCrsMatrix()->getValuesDeviceNonConst();
1258 const LO stride = NSDim*NSDim;
1260 Kokkos::parallel_for(
"MueLu:TentativePF:BlockCrs:main_loop_noqr", policy,
1261 KOKKOS_LAMBDA(
const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
1262 auto agg = thread.league_rank();
1266 Xpetra::global_size_t offset = agg*NSDim;
1269 for (LO j = 0; j < aggSize; j++) {
1270 LO localBlockRow = aggToRowMapLO(
aggRows(agg)+j);
1271 LO rowStart = localBlockRow * stride;
1272 for (LO r = 0; r < (LO)NSDim; r++) {
1273 LO localPointRow = localBlockRow*NSDim + r;
1274 for (LO c = 0; c < (LO)NSDim; c++) {
1275 values[rowStart + r*NSDim + c] = fineNSRandom(localPointRow,c);
1281 for(LO j=0; j<(LO)NSDim; j++)
1285 Ptentative = P_wrap;
1288 throw std::runtime_error(
"TentativePFactory::BuildPuncoupledBlockCrs: Requires Tpetra");
1292 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
1294 BuildPcoupled(RCP<Matrix> , RCP<Aggregates_kokkos> ,
1295 RCP<AmalgamationInfo_kokkos> , RCP<MultiVector> ,
1296 RCP<const Map> , RCP<Matrix>& ,
1297 RCP<MultiVector>& )
const {
1301 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
1303 isGoodMap(
const Map& rowMap,
const Map& colMap)
const {
1304 auto rowLocalMap = rowMap.getLocalMap();
1305 auto colLocalMap = colMap.getLocalMap();
1307 const size_t numRows = rowLocalMap.getLocalNumElements();
1308 const size_t numCols = colLocalMap.getLocalNumElements();
1310 if (numCols < numRows)
1314 Kokkos::parallel_reduce(
"MueLu:TentativePF:isGoodMap",
range_type(0, numRows),
1315 KOKKOS_LAMBDA(
const LO i,
size_t &diff) {
1316 diff += (rowLocalMap.getGlobalElement(i) != colLocalMap.getGlobalElement(i));
1319 return (numDiff == 0);
1324#define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT
#define SET_VALID_ENTRY(name)
maxAggDofSizeType maxAggDofSize
agg2RowMapLOType agg2RowMapLO
MueLu::DefaultLocalOrdinal LocalOrdinal
MueLu::DefaultScalar Scalar
MueLu::DefaultGlobalOrdinal GlobalOrdinal
Exception throws to report errors in the internal logical of the program.
Timer to be used in factories. Similar to Monitor but with additional timers.
Class that holds all level-specific information.
bool IsAvailable(const std::string &ename, const FactoryBase *factory=NoFactory::get()) const
Test whether a need's value has been saved.
int GetLevelID() const
Return level number.
static const NoFactory * get()
static std::string PrintMatrixInfo(const Matrix &A, const std::string &msgTag, RCP< const Teuchos::ParameterList > params=Teuchos::null)
Timer to be used in factories. Similar to SubMonitor but adds a timer level by level.
Kokkos::RangePolicy< local_ordinal_type, execution_space > range_type
static bool MapsAreNested(const Xpetra::Map< LocalOrdinal, GlobalOrdinal, Node > &rowMap, const Xpetra::Map< LocalOrdinal, GlobalOrdinal, Node > &colMap)
Namespace for MueLu classes and methods.
@ Warnings0
Important warning messages (one line)
@ Statistics2
Print even more statistics.
@ Runtime1
Description of what is happening (more verbose)
std::string toString(const T &what)
Little helper function to convert non-string types to strings.