49#ifndef __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__
50#define __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__
55#include "Teuchos_TimeMonitor.hpp"
64 template<
class Scalar,
class DeviceType,
int integralViewRank>
67 using ExecutionSpace =
typename DeviceType::execution_space;
68 using TeamPolicy = Kokkos::TeamPolicy<ExecutionSpace>;
69 using TeamMember =
typename TeamPolicy::member_type;
71 using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
72 IntegralViewType integralView_;
79 int leftComponentSpan_;
80 int rightComponentSpan_;
81 int numTensorComponents_;
82 int leftFieldOrdinalOffset_;
83 int rightFieldOrdinalOffset_;
84 bool forceNonSpecialized_;
86 size_t fad_size_output_ = 0;
88 Kokkos::Array<int, 7> offsetsForComponentOrdinal_;
92 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
93 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
94 Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;
96 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldRelativeEnumerationSpans_;
97 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldRelativeEnumerationSpans_;
110 int leftFieldOrdinalOffset,
111 int rightFieldOrdinalOffset,
112 bool forceNonSpecialized)
114 integralView_(integralData.template getUnderlyingView<integralViewRank>()),
115 leftComponent_(leftComponent),
116 composedTransform_(composedTransform),
117 rightComponent_(rightComponent),
118 cellMeasures_(cellMeasures),
121 leftComponentSpan_(leftComponent.
extent_int(2)),
122 rightComponentSpan_(rightComponent.
extent_int(2)),
124 leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
125 rightFieldOrdinalOffset_(rightFieldOrdinalOffset),
126 forceNonSpecialized_(forceNonSpecialized)
128 INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.
numTensorComponents(), std::invalid_argument,
"Left and right components must have matching number of tensorial components");
131 const int FIELD_DIM = 0;
132 const int POINT_DIM = 1;
136 for (
int r=0; r<numTensorComponents_; r++)
139 maxFieldsLeft_ = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
141 maxFieldsRight_ = std::max(maxFieldsRight_, rightFieldBounds_[r]);
143 maxPointCount_ = std::max(maxPointCount_, pointBounds_[r]);
147 int leftRelativeEnumerationSpan = 1;
148 int rightRelativeEnumerationSpan = 1;
149 for (
int startingComponent=numTensorComponents_-2; startingComponent>=0; startingComponent--)
151 leftRelativeEnumerationSpan *= leftFieldBounds_[startingComponent];
152 rightRelativeEnumerationSpan *= rightFieldBounds_[startingComponent];
153 leftFieldRelativeEnumerationSpans_ [startingComponent] = leftRelativeEnumerationSpan;
154 rightFieldRelativeEnumerationSpans_[startingComponent] = rightRelativeEnumerationSpan;
160 const bool allocateFadStorage = !std::is_pod<Scalar>::value;
161 if (allocateFadStorage)
166 const int R = numTensorComponents_ - 1;
169 int allocationSoFar = 0;
170 offsetsForComponentOrdinal_[R] = allocationSoFar;
173 for (
int r=R-1; r>0; r--)
178 num_ij *= leftFields * rightFields;
180 offsetsForComponentOrdinal_[r] = allocationSoFar;
181 allocationSoFar += num_ij;
183 offsetsForComponentOrdinal_[0] = allocationSoFar;
186 template<
size_t maxComponents,
size_t numComponents = maxComponents>
187 KOKKOS_INLINE_FUNCTION
188 int incrementArgument( Kokkos::Array<int,maxComponents> &arguments,
189 const Kokkos::Array<int,maxComponents> &bounds)
const
191 if (numComponents == 0)
197 int r =
static_cast<int>(numComponents - 1);
198 while (arguments[r] + 1 >= bounds[r])
204 if (r >= 0) ++arguments[r];
210 KOKKOS_INLINE_FUNCTION
212 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
213 const int &numComponents)
const
215 if (numComponents == 0)
return -1;
216 int r =
static_cast<int>(numComponents - 1);
217 while (arguments[r] + 1 >= bounds[r])
223 if (r >= 0) ++arguments[r];
227 template<
size_t maxComponents,
size_t numComponents = maxComponents>
228 KOKKOS_INLINE_FUNCTION
229 int nextIncrementResult(
const Kokkos::Array<int,maxComponents> &arguments,
230 const Kokkos::Array<int,maxComponents> &bounds)
const
232 if (numComponents == 0)
238 int r =
static_cast<int>(numComponents - 1);
239 while (arguments[r] + 1 >= bounds[r])
249 KOKKOS_INLINE_FUNCTION
251 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
252 const int &numComponents)
const
254 if (numComponents == 0)
return -1;
255 int r = numComponents - 1;
256 while (arguments[r] + 1 >= bounds[r])
264 template<
size_t maxComponents,
size_t numComponents = maxComponents>
265 KOKKOS_INLINE_FUNCTION
266 int relativeEnumerationIndex(
const Kokkos::Array<int,maxComponents> &arguments,
267 const Kokkos::Array<int,maxComponents> &bounds,
268 const int startIndex)
const
271 if (numComponents == 0)
277 int enumerationIndex = 0;
278 for (
size_t r=numComponents-1; r>
static_cast<size_t>(startIndex); r--)
280 enumerationIndex += arguments[r];
281 enumerationIndex *= bounds[r-1];
283 enumerationIndex += arguments[startIndex];
284 return enumerationIndex;
298 KOKKOS_INLINE_FUNCTION
301 constexpr int numTensorComponents = 3;
303 Kokkos::Array<int,numTensorComponents> pointBounds;
304 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
305 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
306 for (
unsigned r=0; r<numTensorComponents; r++)
308 pointBounds[r] = pointBounds_[r];
309 leftFieldBounds[r] = leftFieldBounds_[r];
310 rightFieldBounds[r] = rightFieldBounds_[r];
313 const int cellDataOrdinal = teamMember.league_rank();
314 const int numThreads = teamMember.team_size();
315 const int scratchViewSize = offsetsForComponentOrdinal_[0];
317 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> scratchView;
318 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights;
319 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, leftFields_y, leftFields_z, rightFields_x, rightFields_y, rightFields_z;
320 if (fad_size_output_ > 0) {
321 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
322 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1), fad_size_output_);
323 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0], pointBounds[0], fad_size_output_);
324 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0], fad_size_output_);
325 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1], pointBounds[1], fad_size_output_);
326 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1], fad_size_output_);
327 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2], pointBounds[2], fad_size_output_);
328 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2], fad_size_output_);
331 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads);
332 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1));
333 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0], pointBounds[0]);
334 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0]);
335 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1], pointBounds[1]);
336 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1]);
337 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2], pointBounds[2]);
338 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2]);
344 constexpr int R = numTensorComponents - 1;
346 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
348 const int a = a_offset_ + a_component;
349 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
351 const int b = b_offset_ + b_component;
356 const int numLeftFieldsFinal = leftFinalComponent.
extent_int(0);
357 const int numRightFieldsFinal = rightFinalComponent.
extent_int(0);
366 const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
367 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields * maxPointCount_), [&] (
const int& fieldOrdinalPointOrdinal) {
368 const int fieldOrdinal = fieldOrdinalPointOrdinal % maxFields;
369 const int pointOrdinal = fieldOrdinalPointOrdinal / maxFields;
370 if ((fieldOrdinal < leftTensorComponent_x.
extent_int(0)) && (pointOrdinal < leftTensorComponent_x.
extent_int(1)))
372 const int leftRank = leftTensorComponent_x.rank();
373 leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
375 if ((fieldOrdinal < leftTensorComponent_y.
extent_int(0)) && (pointOrdinal < leftTensorComponent_y.
extent_int(1)))
377 const int leftRank = leftTensorComponent_y.rank();
378 leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
380 if ((fieldOrdinal < leftTensorComponent_z.
extent_int(0)) && (pointOrdinal < leftTensorComponent_z.
extent_int(1)))
382 const int leftRank = leftTensorComponent_z.rank();
383 leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
385 if ((fieldOrdinal < rightTensorComponent_x.
extent_int(0)) && (pointOrdinal < rightTensorComponent_x.
extent_int(1)))
387 const int rightRank = rightTensorComponent_x.rank();
388 rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,b_component);
390 if ((fieldOrdinal < rightTensorComponent_y.
extent_int(0)) && (pointOrdinal < rightTensorComponent_y.
extent_int(1)))
392 const int rightRank = rightTensorComponent_y.rank();
393 rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,b_component);
395 if ((fieldOrdinal < rightTensorComponent_z.
extent_int(0)) && (pointOrdinal < rightTensorComponent_z.
extent_int(1)))
397 const int rightRank = rightTensorComponent_z.rank();
398 rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,b_component);
405 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (
const int& pointOrdinal) {
406 pointWeights(pointOrdinal) = composedTransformView(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
411 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
412 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
419 teamMember.team_barrier();
422 const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
423 for (
int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
425 scratchView(i) = 0.0;
429 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (
const int& leftRightFieldEnumeration) {
430 const int iz = leftRightFieldEnumeration % numLeftFieldsFinal;
431 const int jz = leftRightFieldEnumeration / numLeftFieldsFinal;
435 Kokkos::Array<int,numTensorComponents> leftFieldArguments;
436 Kokkos::Array<int,numTensorComponents> rightFieldArguments;
437 rightFieldArguments[R] = jz;
438 leftFieldArguments[R] = iz;
440 Kokkos::Array<int,numTensorComponents> pointArguments;
441 for (
int i=0; i<numTensorComponents; i++)
443 pointArguments[i] = 0;
446 for (
int lx=0; lx<pointBounds[0]; lx++)
448 pointArguments[0] = lx;
452 const int Gy_start_index = scratchOffsetForThread + offsetsForComponentOrdinal_[1];
453 const int Gy_end_index = scratchOffsetForThread + offsetsForComponentOrdinal_[0];
455 for (
int Gy_index=Gy_start_index; Gy_index < Gy_end_index; Gy_index++)
457 scratchView(Gy_index) = 0;
460 for (
int ly=0; ly<pointBounds[1]; ly++)
462 pointArguments[1] = ly;
464 Scalar * Gz = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
467 for (
int lz=0; lz < pointBounds[2]; lz++)
469 const Scalar & leftValue = leftFields_z(iz,lz);
470 const Scalar & rightValue = rightFields_z(jz,lz);
472 pointArguments[2] = lz;
473 int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);
475 *Gz += leftValue * pointWeights(pointEnumerationIndex) * rightValue;
480 for (
int iy=0; iy<leftFieldBounds_[1]; iy++)
482 leftFieldArguments[1] = iy;
483 const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);
485 const Scalar & leftValue = leftFields_y(iy,ly);
487 for (
int jy=0; jy<rightFieldBounds_[1]; jy++)
489 rightFieldArguments[1] = jy;
491 const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);
492 const Scalar & rightValue = rightFields_y(jy,ly);
494 const int & rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];
495 const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;
497 const int Gz_index = 0;
498 const Scalar & Gz = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[2] + Gz_index);
500 Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);
502 Gy += leftValue * Gz * rightValue;
507 for (
int ix=0; ix<leftFieldBounds_[0]; ix++)
509 leftFieldArguments[0] = ix;
510 const Scalar & leftValue = leftFields_x(ix,lx);
512 for (
int iy=0; iy<leftFieldBounds_[1]; iy++)
514 leftFieldArguments[1] = iy;
516 const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);
518 for (
int jx=0; jx<rightFieldBounds_[0]; jx++)
520 rightFieldArguments[0] = jx;
521 const Scalar & rightValue = rightFields_x(jx,lx);
523 for (
int jy=0; jy<rightFieldBounds_[1]; jy++)
525 rightFieldArguments[1] = jy;
526 const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);
528 const int rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];
530 const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;
531 Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);
534 int leftEnumerationIndex = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments, leftFieldBounds, 0);
535 int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
536 const int leftFieldIndex = leftEnumerationIndex + leftFieldOrdinalOffset_;
537 const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;
539 if (integralViewRank == 3)
561 integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex) += leftValue * Gy * rightValue;
566 integralView_.access(leftFieldIndex,rightFieldIndex,0) += leftValue * Gy * rightValue;
580 template<
size_t numTensorComponents>
581 KOKKOS_INLINE_FUNCTION
582 void run(
const TeamMember & teamMember )
const
584 Kokkos::Array<int,numTensorComponents> pointBounds;
585 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
586 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
587 for (
unsigned r=0; r<numTensorComponents; r++)
589 pointBounds[r] = pointBounds_[r];
590 leftFieldBounds[r] = leftFieldBounds_[r];
591 rightFieldBounds[r] = rightFieldBounds_[r];
594 const int cellDataOrdinal = teamMember.league_rank();
595 const int numThreads = teamMember.team_size();
596 const int scratchViewSize = offsetsForComponentOrdinal_[0];
597 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> scratchView;
598 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights;
599 Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged> leftFields, rightFields;
600 if (fad_size_output_ > 0) {
601 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
602 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1), fad_size_output_);
603 leftFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_, maxPointCount_, fad_size_output_);
604 rightFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_, fad_size_output_);
607 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize*numThreads);
608 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1));
609 leftFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_, maxPointCount_);
610 rightFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_);
616 constexpr int R = numTensorComponents - 1;
618 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
620 const int a = a_offset_ + a_component;
621 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
623 const int b = b_offset_ + b_component;
625 const Data<Scalar,DeviceType> & leftFinalComponent = leftComponent_.
getTensorComponent(R);
626 const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.
getTensorComponent(R);
628 const int numLeftFieldsFinal = leftFinalComponent.
extent_int(0);
629 const int numRightFieldsFinal = rightFinalComponent.extent_int(0);
631 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numTensorComponents), [&] (
const int& r) {
632 const Data<Scalar,DeviceType> & leftTensorComponent = leftComponent_.
getTensorComponent(r);
633 const Data<Scalar,DeviceType> & rightTensorComponent = rightComponent_.
getTensorComponent(r);
634 const int leftFieldCount = leftTensorComponent.
extent_int(0);
635 const int pointCount = leftTensorComponent.extent_int(1);
636 const int leftRank = leftTensorComponent.rank();
637 const int rightFieldCount = rightTensorComponent.extent_int(0);
638 const int rightRank = rightTensorComponent.rank();
639 for (
int fieldOrdinal=0; fieldOrdinal<maxFieldsLeft_; fieldOrdinal++)
642 const int fieldAddress = (fieldOrdinal < leftFieldCount) ? fieldOrdinal : leftFieldCount - 1;
643 for (
int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
645 const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
646 leftFields(r,fieldAddress,pointAddress) = (leftRank == 2) ? leftTensorComponent(fieldAddress,pointAddress) : leftTensorComponent(fieldAddress,pointAddress,a_component);
649 for (
int fieldOrdinal=0; fieldOrdinal<maxFieldsRight_; fieldOrdinal++)
652 const int fieldAddress = (fieldOrdinal < rightFieldCount) ? fieldOrdinal : rightFieldCount - 1;
653 for (
int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
655 const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
656 rightFields(r,fieldAddress,pointAddress) = (rightRank == 2) ? rightTensorComponent(fieldAddress,pointAddress) : rightTensorComponent(fieldAddress,pointAddress,b_component);
661 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
662 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
667 teamMember.team_barrier();
669 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (
const int& leftRightFieldEnumeration) {
670 const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
671 const int i_R = leftRightFieldEnumeration % numLeftFieldsFinal;
672 const int j_R = leftRightFieldEnumeration / numLeftFieldsFinal;
676 Kokkos::Array<int,numTensorComponents> leftFieldArguments;
677 Kokkos::Array<int,numTensorComponents> rightFieldArguments;
678 rightFieldArguments[R] = j_R;
679 leftFieldArguments[R] = i_R;
682 for (
int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
684 scratchView(i) = 0.0;
686 Kokkos::Array<int,numTensorComponents> pointArguments;
687 for (
unsigned i=0; i<numTensorComponents; i++)
689 pointArguments[i] = 0;
696 const int pointBounds_R = pointBounds[R];
697 int & pointArgument_R = pointArguments[R];
698 for (pointArgument_R=0; pointArgument_R < pointBounds_R; pointArgument_R++)
703 G_R = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
707 const int leftFieldIndex = i_R + leftFieldOrdinalOffset_;
708 const int rightFieldIndex = j_R + rightFieldOrdinalOffset_;
710 if (integralViewRank == 3)
713 G_R = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
718 G_R = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
722 const int & pointIndex_R = pointArguments[R];
724 const Scalar & leftValue = leftFields(R,i_R,pointIndex_R);
725 const Scalar & rightValue = rightFields(R,j_R,pointIndex_R);
727 int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);
729 *G_R += leftValue * pointWeights(pointEnumerationIndex) * rightValue;
734 const int r_next = nextIncrementResult<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds);
735 const int r_min = (r_next >= 0) ? r_next : 0;
737 for (
int s=R-1; s>=r_min; s--)
739 const int & pointIndex_s = pointArguments[s];
742 for (
int s1=s; s1<R; s1++)
744 leftFieldArguments[s1] = 0;
748 const int & i_s = leftFieldArguments[s];
749 const int & j_s = rightFieldArguments[s];
752 const int & rightEnumerationSpan_s = rightFieldRelativeEnumerationSpans_[s];
753 const int & rightEnumerationSpan_sp = rightFieldRelativeEnumerationSpans_[s+1];
757 const int leftEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s);
760 const int leftEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s+1);
762 const Scalar & leftValue = leftFields(s,i_s,pointIndex_s);
764 for (
int s1=s; s1<R; s1++)
766 rightFieldArguments[s1] = 0;
771 const int rightEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s);
774 const int rightEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s+1);
776 const Scalar & rightValue = rightFields(s,j_s,pointIndex_s);
778 const int G_s_index = leftEnumerationIndex_s * rightEnumerationSpan_s + rightEnumerationIndex_s;
784 const int G_sp_index = leftEnumerationIndex_sp * rightEnumerationSpan_sp + rightEnumerationIndex_sp;
786 const Scalar & G_sp = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s+1] + G_sp_index);
791 G_s = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s] + G_s_index);
796 int leftEnumerationIndex = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments, leftFieldBounds, 0);
797 int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
798 const int leftFieldIndex = leftEnumerationIndex + leftFieldOrdinalOffset_;
799 const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;
801 if (integralViewRank == 3)
804 G_s = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
809 G_s = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
813 *G_s += leftValue * G_sp * rightValue;
818 sRight = incrementArgument<numTensorComponents,R>(rightFieldArguments, rightFieldBounds);
822 sLeft = incrementArgument<numTensorComponents,R>(leftFieldArguments, leftFieldBounds);
829 const int endIndex = scratchOffsetForThread + offsetsForComponentOrdinal_[r_min];
830 for (
int i=scratchOffsetForThread; i<endIndex; i++)
832 scratchView(i) = 0.0;
839 r = incrementArgument<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds);
847 KOKKOS_INLINE_FUNCTION
848 void operator()(
const TeamMember & teamMember )
const
850 switch (numTensorComponents_)
852 case 1: run<1>(teamMember);
break;
853 case 2: run<2>(teamMember);
break;
855 if (forceNonSpecialized_)
860 case 4: run<4>(teamMember);
break;
861 case 5: run<5>(teamMember);
break;
862 case 6: run<6>(teamMember);
break;
863 case 7: run<7>(teamMember);
break;
864#ifdef INTREPID2_HAVE_DEBUG
877 const int R = numTensorComponents_ - 1;
883 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
885 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
890 const int numLeftFieldsFinal = leftFinalComponent.
extent_int(0);
891 const int numRightFieldsFinal = rightFinalComponent.
extent_int(0);
893 flopCount += flopsPerPoint_ab * cellMeasures_.
extent_int(1);
895 int flopsPer_i_R_j_R = 0;
899 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldArguments;
900 leftFieldArguments[R] = 0;
902 Kokkos::Array<int,Parameters::MaxTensorComponents> pointArguments;
903 for (
int i=0; i<numTensorComponents_; i++)
905 pointArguments[i] = 0;
910 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldArguments;
911 rightFieldArguments[R] = 0;
917 for (pointArguments[R]=0; pointArguments[R] < pointBounds_[R]; pointArguments[R]++)
919 flopsPer_i_R_j_R += 4;
927 const int r_next = nextIncrementResult(pointArguments, pointBounds_, numTensorComponents_);
928 const int r_min = (r_next >= 0) ? r_next : 0;
930 for (
int s=R-1; s>=r_min; s--)
933 int numLeftIterations = leftFieldRelativeEnumerationSpans_[s];
934 int numRightIterations = rightFieldRelativeEnumerationSpans_[s];
936 flopsPer_i_R_j_R += numLeftIterations * numRightIterations * 3;
940 r = incrementArgument(pointArguments, pointBounds_, numTensorComponents_);
943 flopCount += flopsPer_i_R_j_R * numLeftFieldsFinal * numRightFieldsFinal;
951 int teamSize(
const int &maxTeamSizeFromKokkos)
const
953 const int R = numTensorComponents_ - 1;
954 const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
955 return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
962 size_t shmem_size = 0;
964 if (fad_size_output_ > 0)
966 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(offsetsForComponentOrdinal_[0] * team_size, fad_size_output_);
967 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.
extent_int(1), fad_size_output_);
968 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_, maxPointCount_, fad_size_output_);
969 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_, fad_size_output_);
973 shmem_size += Kokkos::View<Scalar*, DeviceType>::shmem_size(offsetsForComponentOrdinal_[0] * team_size);
974 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.
extent_int(1));
975 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_, maxPointCount_);
976 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_);
992 template<
class Scalar,
class DeviceType,
int integralViewRank>
995 using ExecutionSpace =
typename DeviceType::execution_space;
996 using TeamPolicy = Kokkos::TeamPolicy<DeviceType>;
997 using TeamMember =
typename TeamPolicy::member_type;
999 using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
1000 IntegralViewType integralView_;
1007 int leftComponentSpan_;
1008 int rightComponentSpan_;
1009 int numTensorComponents_;
1010 int leftFieldOrdinalOffset_;
1011 int rightFieldOrdinalOffset_;
1013 size_t fad_size_output_ = 0;
1017 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
1018 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
1019 Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;
1022 int maxFieldsRight_;
1032 int leftFieldOrdinalOffset,
1033 int rightFieldOrdinalOffset)
1035 integralView_(integralData.template getUnderlyingView<integralViewRank>()),
1036 leftComponent_(leftComponent),
1037 composedTransform_(composedTransform),
1038 rightComponent_(rightComponent),
1039 cellMeasures_(cellMeasures),
1040 a_offset_(a_offset),
1041 b_offset_(b_offset),
1042 leftComponentSpan_(leftComponent.
extent_int(2)),
1043 rightComponentSpan_(rightComponent.
extent_int(2)),
1045 leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
1046 rightFieldOrdinalOffset_(rightFieldOrdinalOffset)
1048 INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.
numTensorComponents(), std::invalid_argument,
"Left and right components must have matching number of tensorial components");
1050 const int FIELD_DIM = 0;
1051 const int POINT_DIM = 1;
1053 maxFieldsRight_ = 0;
1055 for (
int r=0; r<numTensorComponents_; r++)
1058 maxFieldsLeft_ = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
1060 maxFieldsRight_ = std::max(maxFieldsRight_, rightFieldBounds_[r]);
1062 maxPointCount_ = std::max(maxPointCount_, pointBounds_[r]);
1068 const bool allocateFadStorage = !std::is_pod<Scalar>::value;
1069 if (allocateFadStorage)
1075 template<
size_t maxComponents,
size_t numComponents = maxComponents>
1076 KOKKOS_INLINE_FUNCTION
1077 int incrementArgument( Kokkos::Array<int,maxComponents> &arguments,
1078 const Kokkos::Array<int,maxComponents> &bounds)
const
1080 if (numComponents == 0)
return -1;
1081 int r =
static_cast<int>(numComponents - 1);
1082 while (arguments[r] + 1 >= bounds[r])
1088 if (r >= 0) ++arguments[r];
1093 KOKKOS_INLINE_FUNCTION
1095 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
1096 const int &numComponents)
const
1098 if (numComponents == 0)
return -1;
1099 int r =
static_cast<int>(numComponents - 1);
1100 while (arguments[r] + 1 >= bounds[r])
1106 if (r >= 0) ++arguments[r];
1110 template<
size_t maxComponents,
size_t numComponents = maxComponents>
1111 KOKKOS_INLINE_FUNCTION
1112 int nextIncrementResult(
const Kokkos::Array<int,maxComponents> &arguments,
1113 const Kokkos::Array<int,maxComponents> &bounds)
const
1115 if (numComponents == 0)
return -1;
1116 int r =
static_cast<int>(numComponents - 1);
1117 while (arguments[r] + 1 >= bounds[r])
1126 KOKKOS_INLINE_FUNCTION
1128 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
1129 const int &numComponents)
const
1131 if (numComponents == 0)
return -1;
1132 int r = numComponents - 1;
1133 while (arguments[r] + 1 >= bounds[r])
1141 template<
size_t maxComponents,
size_t numComponents = maxComponents>
1142 KOKKOS_INLINE_FUNCTION
1143 int relativeEnumerationIndex(
const Kokkos::Array<int,maxComponents> &arguments,
1144 const Kokkos::Array<int,maxComponents> &bounds,
1145 const int startIndex)
const
1148 if (numComponents == 0)
1152 int enumerationIndex = 0;
1153 for (
size_t r=numComponents-1; r>
static_cast<size_t>(startIndex); r--)
1155 enumerationIndex += arguments[r];
1156 enumerationIndex *= bounds[r-1];
1158 enumerationIndex += arguments[startIndex];
1159 return enumerationIndex;
1163 KOKKOS_INLINE_FUNCTION
1164 enable_if_t<rank==3 && rank==integralViewRank, Scalar &>
1165 integralViewEntry(
const IntegralViewType& integralView,
const int &cellDataOrdinal,
const int &i,
const int &j)
const
1167 return integralView(cellDataOrdinal,i,j);
1171 KOKKOS_INLINE_FUNCTION
1172 enable_if_t<rank==2 && rank==integralViewRank, Scalar &>
1173 integralViewEntry(
const IntegralViewType& integralView,
const int &cellDataOrdinal,
const int &i,
const int &j)
const
1175 return integralView(i,j);
1179 KOKKOS_INLINE_FUNCTION
1182 constexpr int numTensorComponents = 3;
1184 const int pointBounds_x = pointBounds_[0];
1185 const int pointBounds_y = pointBounds_[1];
1186 const int pointBounds_z = pointBounds_[2];
1187 const int pointsInNonzeroComponentDimensions = pointBounds_y * pointBounds_z;
1189 const int leftFieldBounds_x = leftFieldBounds_[0];
1190 const int rightFieldBounds_x = rightFieldBounds_[0];
1191 const int leftFieldBounds_y = leftFieldBounds_[1];
1192 const int rightFieldBounds_y = rightFieldBounds_[1];
1193 const int leftFieldBounds_z = leftFieldBounds_[2];
1194 const int rightFieldBounds_z = rightFieldBounds_[2];
1196 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1197 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1198 for (
unsigned r=0; r<numTensorComponents; r++)
1200 leftFieldBounds[r] = leftFieldBounds_[r];
1201 rightFieldBounds[r] = rightFieldBounds_[r];
1204 const auto integralView = integralView_;
1205 const auto leftFieldOrdinalOffset = leftFieldOrdinalOffset_;
1206 const auto rightFieldOrdinalOffset = rightFieldOrdinalOffset_;
1208 const int cellDataOrdinal = teamMember.league_rank();
1209 const int threadNumber = teamMember.team_rank();
1211 const int numThreads = teamMember.team_size();
1212 const int GyEntryCount = pointBounds_z;
1213 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GxIntegrals;
1214 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GyIntegrals;
1215 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GzIntegral;
1216 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights;
1218 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, rightFields_x;
1219 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_y, rightFields_y;
1220 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_z, rightFields_z;
1221 if (fad_size_output_ > 0) {
1222 GxIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions, fad_size_output_);
1223 GyIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads, fad_size_output_);
1224 GzIntegral = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numThreads, fad_size_output_);
1225 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1), fad_size_output_);
1227 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x, pointBounds_x, fad_size_output_);
1228 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x, fad_size_output_);
1229 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y, pointBounds_y, fad_size_output_);
1230 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y, fad_size_output_);
1231 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z, pointBounds_z, fad_size_output_);
1232 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z, fad_size_output_);
1235 GxIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions);
1236 GyIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads);
1237 GzIntegral = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numThreads);
1238 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1));
1240 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x, pointBounds_x);
1241 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x);
1242 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y, pointBounds_y);
1243 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y);
1244 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z, pointBounds_z);
1245 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z);
1254 teamMember.team_barrier();
1256 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
1258 const int a = a_offset_ + a_component;
1259 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
1261 const int b = b_offset_ + b_component;
1270 const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
1271 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields), [&] (
const int& fieldOrdinal) {
1272 if (fieldOrdinal < leftTensorComponent_x.
extent_int(0))
1274 const int pointCount = leftTensorComponent_x.extent_int(1);
1275 const int leftRank = leftTensorComponent_x.rank();
1276 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1278 leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
1281 if (fieldOrdinal < leftTensorComponent_y.
extent_int(0))
1283 const int pointCount = leftTensorComponent_y.extent_int(1);
1284 const int leftRank = leftTensorComponent_y.rank();
1285 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1287 leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
1290 if (fieldOrdinal < leftTensorComponent_z.extent_int(0))
1292 const int pointCount = leftTensorComponent_z.extent_int(1);
1293 const int leftRank = leftTensorComponent_z.rank();
1294 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1296 leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
1299 if (fieldOrdinal < rightTensorComponent_x.extent_int(0))
1301 const int pointCount = rightTensorComponent_x.extent_int(1);
1302 const int rightRank = rightTensorComponent_x.rank();
1303 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1305 rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
1308 if (fieldOrdinal < rightTensorComponent_y.extent_int(0))
1310 const int pointCount = rightTensorComponent_y.extent_int(1);
1311 const int rightRank = rightTensorComponent_y.rank();
1312 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1314 rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
1317 if (fieldOrdinal < rightTensorComponent_z.extent_int(0))
1319 const int pointCount = rightTensorComponent_z.extent_int(1);
1320 const int rightRank = rightTensorComponent_z.rank();
1321 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1323 rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
1328 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
1329 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1333 teamMember.team_barrier();
1335 for (
int i0=0; i0<leftFieldBounds_x; i0++)
1337 for (
int j0=0; j0<rightFieldBounds_x; j0++)
1339 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (
const int& pointEnumerationIndexLaterDimensions) {
1341 const int tensorPointEnumerationOffset = pointBounds_x * pointEnumerationIndexLaterDimensions;
1343 Scalar & Gx = GxIntegrals(pointEnumerationIndexLaterDimensions);
1346 if (fad_size_output_ == 0)
1349 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_x), [&] (
const int &x_pointOrdinal, Scalar &integralThusFar)
1351 integralThusFar += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1356 for (
int x_pointOrdinal=0; x_pointOrdinal<pointBounds_x; x_pointOrdinal++)
1358 Gx += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1364 teamMember.team_barrier();
1366 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,leftFieldBounds_y * rightFieldBounds_y), [&] (
const int& i1j1) {
1367 const int i1 = i1j1 % leftFieldBounds_y;
1368 const int j1 = i1j1 / leftFieldBounds_y;
1370 int Gy_index = GyEntryCount * threadNumber;
1372 int pointEnumerationIndex = 0;
1373 for (
int lz=0; lz<pointBounds_z; lz++)
1375 Scalar & Gy = GyIntegrals(Gy_index);
1378 for (
int ly=0; ly<pointBounds_y; ly++)
1380 const Scalar & leftValue = leftFields_y(i1,ly);
1381 const Scalar & rightValue = rightFields_y(j1,ly);
1383 Gy += leftValue * rightValue * GxIntegrals(pointEnumerationIndex);
1385 pointEnumerationIndex++;
1390 Scalar & Gz = GzIntegral(threadNumber);
1391 for (
int i2=0; i2<leftFieldBounds_z; i2++)
1393 for (
int j2=0; j2<rightFieldBounds_z; j2++)
1397 int Gy_index = GyEntryCount * threadNumber;
1399 for (
int lz=0; lz<pointBounds_z; lz++)
1401 const Scalar & leftValue = leftFields_z(i2,lz);
1402 const Scalar & rightValue = rightFields_z(j2,lz);
1404 Gz += leftValue * rightValue * GyIntegrals(Gy_index);
1409 const int i = leftFieldOrdinalOffset + i0 + (i1 + i2 * leftFieldBounds_y) * leftFieldBounds_x;
1410 const int j = rightFieldOrdinalOffset + j0 + (j1 + j2 * rightFieldBounds_y) * rightFieldBounds_x;
1415 integralViewEntry<integralViewRank>(integralView, cellDataOrdinal, i, j) += Gz;
1420 teamMember.team_barrier();
1427 template<
size_t numTensorComponents>
1428 KOKKOS_INLINE_FUNCTION
1429 void run(
const TeamMember & teamMember )
const
1567 KOKKOS_INLINE_FUNCTION
1568 void operator()(
const TeamMember & teamMember )
const
1570 switch (numTensorComponents_)
1572 case 1: run<1>(teamMember);
break;
1573 case 2: run<2>(teamMember);
break;
1574 case 3: runSpecialized3(teamMember);
break;
1575 case 4: run<4>(teamMember);
break;
1576 case 5: run<5>(teamMember);
break;
1577 case 6: run<6>(teamMember);
break;
1578 case 7: run<7>(teamMember);
break;
1579#ifdef INTREPID2_HAVE_DEBUG
1592 constexpr int numTensorComponents = 3;
1593 Kokkos::Array<int,numTensorComponents> pointBounds;
1594 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1595 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1597 int pointsInNonzeroComponentDimensions = 1;
1598 for (
unsigned r=0; r<numTensorComponents; r++)
1600 pointBounds[r] = pointBounds_[r];
1601 if (r > 0) pointsInNonzeroComponentDimensions *= pointBounds[r];
1602 leftFieldBounds[r] = leftFieldBounds_[r];
1603 rightFieldBounds[r] = rightFieldBounds_[r];
1606 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
1608 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
1615 for (
int i0=0; i0<leftFieldBounds[0]; i0++)
1617 for (
int j0=0; j0<rightFieldBounds[0]; j0++)
1619 flopCount += pointsInNonzeroComponentDimensions * pointBounds[0] * 3;
1659 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * pointBounds[1] * pointBounds[2] * 3;
1660 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * pointBounds[2] * 3;
1661 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * 1;
1737 const int R = numTensorComponents_ - 1;
1738 const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
1739 return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
1746 size_t shmem_size = 0;
1748 const int GyEntryCount = pointBounds_[2];
1750 int pointsInNonzeroComponentDimensions = 1;
1751 for (
int d=1; d<numTensorComponents_; d++)
1753 pointsInNonzeroComponentDimensions *= pointBounds_[d];
1756 if (fad_size_output_ > 0)
1758 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions, fad_size_output_);
1759 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads, fad_size_output_);
1760 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( 1 * numThreads, fad_size_output_);
1761 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.
extent_int(1), fad_size_output_);
1763 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0], fad_size_output_);
1764 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0], fad_size_output_);
1765 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1], fad_size_output_);
1766 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1], fad_size_output_);
1767 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2], fad_size_output_);
1768 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2], fad_size_output_);
1772 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions);
1773 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads);
1774 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( 1 * numThreads);
1775 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.
extent_int(1));
1777 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0]);
1778 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0]);
1779 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1]);
1780 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1]);
1781 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2]);
1782 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2]);
1790template<
typename DeviceType>
1791template<
class Scalar>
1800 const bool leftHasOrdinalFilter = vectorDataLeft.basisValues().ordinalFilter().extent_int(0) > 0;
1801 const bool rightHasOrdinalFilter = vectorDataRight.basisValues().ordinalFilter().extent_int(0) > 0;
1802 TEUCHOS_TEST_FOR_EXCEPTION(leftHasOrdinalFilter || rightHasOrdinalFilter, std::invalid_argument,
"Ordinal filters for BasisValues are not yet supported by IntegrationTools");
1808 const int CELL_DIM = 0;
1810 const auto leftTransform = vectorDataLeft.
transform();
1812 DimensionInfo combinedCellDimInfo = cellMeasureData.getDimensionInfo(CELL_DIM);
1817 int cellDataExtent = combinedCellDimInfo.dataExtent;
1819 const int numCells = vectorDataLeft.
numCells();
1820 const int numFieldsLeft = vectorDataLeft.
numFields();
1821 const int numFieldsRight = vectorDataRight.
numFields();
1823 Kokkos::Array<int,3> extents {numCells, numFieldsLeft, numFieldsRight};
1824 Kokkos::Array<DataVariationType,3> variationTypes {cellVariationType,
GENERAL,
GENERAL};
1828 Kokkos::View<Scalar***,DeviceType> data(
"Intrepid2 integral data",cellDataExtent,numFieldsLeft,numFieldsRight);
1833 Kokkos::View<Scalar**,DeviceType> data(
"Intrepid2 integral data",numFieldsLeft,numFieldsRight);
1841template<
typename DeviceType>
1842template<
class Scalar>
1846 double* approximateFlops)
1848 using ExecutionSpace =
typename DeviceType::execution_space;
1852 const bool leftHasOrdinalFilter = basisValuesLeft.basisValues().ordinalFilter().extent_int(0) > 0;
1853 const bool rightHasOrdinalFilter = basisValuesRight.basisValues().ordinalFilter().extent_int(0) > 0;
1854 TEUCHOS_TEST_FOR_EXCEPTION(leftHasOrdinalFilter || rightHasOrdinalFilter, std::invalid_argument,
"Ordinal filters for BasisValues are not yet supported by IntegrationTools");
1856 if (approximateFlops != NULL)
1858 *approximateFlops = 0;
1870 const int numFieldsLeft = basisValuesLeft.
numFields();
1871 const int numFieldsRight = basisValuesRight.
numFields();
1872 const int spaceDim = basisValuesLeft.
spaceDim();
1874 INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.
spaceDim() != basisValuesRight.
spaceDim(), std::invalid_argument,
"vectorDataLeft and vectorDataRight must agree on the space dimension");
1876 const int leftFamilyCount = basisValuesLeft.basisValues().
numFamilies();
1877 const int rightFamilyCount = basisValuesRight.basisValues().
numFamilies();
1881 int numTensorComponentsLeft = -1;
1882 const bool isVectorValued = basisValuesLeft.
vectorData().isValid();
1885 const bool rightIsVectorValued = basisValuesRight.
vectorData().isValid();
1886 INTREPID2_TEST_FOR_EXCEPTION(!rightIsVectorValued, std::invalid_argument,
"left and right must either both be vector-valued, or both scalar-valued");
1887 const auto &refVectorLeft = basisValuesLeft.
vectorData();
1888 int numFamiliesLeft = refVectorLeft.numFamilies();
1889 int numVectorComponentsLeft = refVectorLeft.numComponents();
1890 Kokkos::Array<int,7> maxFieldsForComponentLeft {0,0,0,0,0,0,0};
1891 Kokkos::Array<int,7> maxFieldsForComponentRight {0,0,0,0,0,0,0};
1892 for (
int familyOrdinal=0; familyOrdinal<numFamiliesLeft; familyOrdinal++)
1894 for (
int vectorComponent=0; vectorComponent<numVectorComponentsLeft; vectorComponent++)
1899 if (numTensorComponentsLeft == -1)
1903 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != tensorData.
numTensorComponents(), std::invalid_argument,
"Each valid entry in vectorDataLeft must have the same number of tensor components as every other");
1904 for (
int r=0; r<numTensorComponentsLeft; r++)
1911 int numTensorComponentsRight = -1;
1912 const auto &refVectorRight = basisValuesRight.
vectorData();
1913 int numFamiliesRight = refVectorRight.numFamilies();
1914 int numVectorComponentsRight = refVectorRight.numComponents();
1915 for (
int familyOrdinal=0; familyOrdinal<numFamiliesRight; familyOrdinal++)
1917 for (
int vectorComponent=0; vectorComponent<numVectorComponentsRight; vectorComponent++)
1919 const auto &tensorData = refVectorRight.getComponent(familyOrdinal,vectorComponent);
1920 if (tensorData.numTensorComponents() > 0)
1922 if (numTensorComponentsRight == -1)
1924 numTensorComponentsRight = tensorData.numTensorComponents();
1926 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsRight != tensorData.numTensorComponents(), std::invalid_argument,
"Each valid entry in vectorDataRight must have the same number of tensor components as every other");
1927 for (
int r=0; r<numTensorComponentsRight; r++)
1929 maxFieldsForComponentRight[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentRight[r]);
1934 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != numVectorComponentsRight, std::invalid_argument,
"Left and right vector entries must have the same number of tensorial components");
1939 for (
int familyOrdinal = 0; familyOrdinal < leftFamilyCount; familyOrdinal++)
1941 INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.basisValues().
tensorData(familyOrdinal).
numTensorComponents() != numTensorComponentsLeft, std::invalid_argument,
"All families must match in the number of tensor components");
1945 for (
int familyOrdinal=0; familyOrdinal< rightFamilyCount; familyOrdinal++)
1947 INTREPID2_TEST_FOR_EXCEPTION(basisValuesRight.basisValues().
tensorData(familyOrdinal).
numTensorComponents() != numTensorComponentsLeft, std::invalid_argument,
"Right families must match left in the number of tensor components");
1952 if ((numPointTensorComponents == numTensorComponentsLeft) && basisValuesLeft.
axisAligned() && basisValuesRight.
axisAligned())
1961 Kokkos::Array<int,Parameters::MaxTensorComponents> pointDimensions;
1962 for (
int r=0; r<numPointTensorComponents; r++)
1969 Kokkos::View<Scalar**, DeviceType> integralView2;
1970 Kokkos::View<Scalar***, DeviceType> integralView3;
1971 if (integralViewRank == 3)
1979 for (
int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
1984 const int leftVectorComponentCount = isVectorValued ? basisValuesLeft.
vectorData().numComponents() : 1;
1985 for (
int leftVectorComponentOrdinal = 0; leftVectorComponentOrdinal < leftVectorComponentCount; leftVectorComponentOrdinal++)
1988 : basisValuesLeft.basisValues().
tensorData(leftFamilyOrdinal);
1994 const int leftDimSpan = leftComponent.
extent_int(2);
1996 const int leftComponentFieldCount = leftComponent.
extent_int(0);
1998 for (
int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2001 int rightFieldOffset = basisValuesRight.
vectorData().familyFieldOrdinalOffset(rightFamilyOrdinal);
2003 const int rightVectorComponentCount = isVectorValued ? basisValuesRight.
vectorData().numComponents() : 1;
2004 for (
int rightVectorComponentOrdinal = 0; rightVectorComponentOrdinal < rightVectorComponentCount; rightVectorComponentOrdinal++)
2007 : basisValuesRight.basisValues().
tensorData(rightFamilyOrdinal);
2008 if (!rightComponent.
isValid())
2013 const int rightDimSpan = rightComponent.
extent_int(2);
2015 const int rightComponentFieldCount = rightComponent.
extent_int(0);
2018 if ((a_offset + leftDimSpan <= b_offset) || (b_offset + rightDimSpan <= a_offset))
2020 b_offset += rightDimSpan;
2026 INTREPID2_TEST_FOR_EXCEPTION(( a_offset != b_offset), std::logic_error,
"left and right dimension offsets should match.");
2027 INTREPID2_TEST_FOR_EXCEPTION(( leftDimSpan != rightDimSpan), std::invalid_argument,
"left and right components must span the same number of dimensions.");
2029 const int d_start = a_offset;
2030 const int d_end = d_start + leftDimSpan;
2032 using ComponentIntegralsArray = Kokkos::Array< Kokkos::Array<ScalarView<Scalar,DeviceType>, Parameters::MaxTensorComponents>, Parameters::MaxTensorComponents>;
2033 ComponentIntegralsArray componentIntegrals;
2034 for (
int r=0; r<numPointTensorComponents; r++)
2051 const int numPoints = pointDimensions[r];
2058 const int leftTensorComponentRank = leftTensorComponent.
rank();
2059 const int leftTensorComponentDimSpan = leftTensorComponent.
extent_int(2);
2060 const int leftTensorComponentFields = leftTensorComponent.
extent_int(0);
2061 const int rightTensorComponentRank = rightTensorComponent.
rank();
2062 const int rightTensorComponentDimSpan = rightTensorComponent.
extent_int(2);
2063 const int rightTensorComponentFields = rightTensorComponent.
extent_int(0);
2065 INTREPID2_TEST_FOR_EXCEPTION(( leftTensorComponentDimSpan != rightTensorComponentDimSpan), std::invalid_argument,
"left and right components must span the same number of dimensions.");
2067 for (
int d=d_start; d<d_end; d++)
2069 const bool allocateFadStorage = !std::is_pod<Scalar>::value;
2070 if (allocateFadStorage)
2073 componentIntegrals[r][d] = ScalarView<Scalar,DeviceType>(
"componentIntegrals for tensor component " + std::to_string(r) +
", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields, fad_size_output);
2077 componentIntegrals[r][d] = ScalarView<Scalar,DeviceType>(
"componentIntegrals for tensor component " + std::to_string(r) +
", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields);
2080 auto componentIntegralView = componentIntegrals[r][d];
2082 auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<2>>({0,0},{leftTensorComponentFields,rightTensorComponentFields});
2084 for (
int a=0; a<leftTensorComponentDimSpan; a++)
2086 Kokkos::parallel_for(
"compute componentIntegrals", policy,
2087 KOKKOS_LAMBDA (
const int &i,
const int &j) {
2088 Scalar refSpaceIntegral = 0.0;
2089 for (
int ptOrdinal=0; ptOrdinal<numPoints; ptOrdinal++)
2091 const Scalar & leftValue = ( leftTensorComponentRank == 2) ? leftTensorComponent(i,ptOrdinal) : leftTensorComponent(i,ptOrdinal,a);
2092 const Scalar & rightValue = (rightTensorComponentRank == 2) ? rightTensorComponent(j,ptOrdinal) : rightTensorComponent(j,ptOrdinal,a);
2093 refSpaceIntegral += leftValue * rightValue * quadratureWeights(ptOrdinal);
2095 componentIntegralView(i,j) = refSpaceIntegral;
2099 if (approximateFlops != NULL)
2101 *approximateFlops += leftTensorComponentFields*rightTensorComponentFields*numPoints*(3);
2106 ExecutionSpace().fence();
2108 Kokkos::Array<int,3> upperBounds {cellDataExtent,leftComponentFieldCount,rightComponentFieldCount};
2109 Kokkos::Array<int,3> lowerBounds {0,0,0};
2110 auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>(lowerBounds, upperBounds);
2112 Kokkos::parallel_for(
"compute field integrals", policy,
2113 KOKKOS_LAMBDA (
const int &cellDataOrdinal,
const int &leftFieldOrdinal,
const int &rightFieldOrdinal) {
2114 const Scalar & cellMeasureWeight = cellMeasures.
getTensorComponent(0)(cellDataOrdinal);
2122 Scalar integralSum = 0.0;
2123 for (
int d=d_start; d<d_end; d++)
2125 const Scalar & transformLeft_d = basisValuesLeft.
transformWeight(cellDataOrdinal,0,d,d);
2126 const Scalar & transformRight_d = basisValuesRight.
transformWeight(cellDataOrdinal,0,d,d);
2128 const Scalar & leftRightTransform_d = transformLeft_d * transformRight_d;
2131 Scalar integral_d = 1.0;
2133 for (
int r=0; r<numPointTensorComponents; r++)
2135 integral_d *= componentIntegrals[r][d](leftTensorIterator.
argument(r),rightTensorIterator.
argument(r));
2138 integralSum += leftRightTransform_d * integral_d;
2141 const int i = leftFieldOrdinal + leftFieldOffset;
2142 const int j = rightFieldOrdinal + rightFieldOffset;
2144 if (integralViewRank == 3)
2146 integralView3(cellDataOrdinal,i,j) += cellMeasureWeight * integralSum;
2150 integralView2(i,j) += cellMeasureWeight * integralSum;
2154 b_offset += rightDimSpan;
2157 a_offset += leftDimSpan;
2161 if (approximateFlops != NULL)
2164 *approximateFlops += (2 + spaceDim * (3 + numPointTensorComponents)) * cellDataExtent * numFieldsLeft * numFieldsRight;
2172 const bool transposeLeft =
true;
2173 const bool transposeRight =
false;
2177 const bool matrixTransform = (leftTransform.
rank() == 4) || (rightTransform.
rank() == 4);
2182 if (matrixTransform)
2184 composedTransform = leftTransform.
allocateMatMatResult(transposeLeft, leftTransform, transposeRight, rightTransform);
2185 composedTransform.
storeMatMat(transposeLeft, leftTransform, transposeRight, rightTransform);
2188 if (approximateFlops != NULL)
2199 const int newRank = 4;
2200 auto extents = composedTransform.
getExtents();
2202 composedTransform = composedTransform.
shallowCopy(newRank, extents, variationTypes);
2203 if (approximateFlops != NULL)
2209 else if (leftTransform.
isValid())
2212 composedTransform = leftTransform;
2214 else if (rightTransform.
isValid())
2217 composedTransform = rightTransform;
2222 Kokkos::Array<ordinal_type,4> extents {basisValuesLeft.
numCells(),basisValuesLeft.
numPoints(),spaceDim,spaceDim};
2225 Kokkos::View<Scalar*,DeviceType> identityUnderlyingView(
"Intrepid2::FST::integrate() - identity view",spaceDim);
2226 Kokkos::deep_copy(identityUnderlyingView, 1.0);
2234 const int leftFamilyCount = basisValuesLeft. basisValues().numFamilies();
2235 const int rightFamilyCount = basisValuesRight.basisValues().
numFamilies();
2236 const int leftComponentCount = isVectorValued ? basisValuesLeft. vectorData().numComponents() : 1;
2237 const int rightComponentCount = isVectorValued ? basisValuesRight.
vectorData().numComponents() : 1;
2239 int leftFieldOrdinalOffset = 0;
2240 for (
int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2245 bool haveLaunchedContributionToCurrentFamilyLeft =
false;
2246 for (
int leftComponentOrdinal=0; leftComponentOrdinal<leftComponentCount; leftComponentOrdinal++)
2249 : basisValuesLeft.basisValues().
tensorData(leftFamilyOrdinal);
2253 a_offset += basisValuesLeft.
vectorData().numDimsForComponent(leftComponentOrdinal);
2257 int rightFieldOrdinalOffset = 0;
2258 for (
int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2262 bool haveLaunchedContributionToCurrentFamilyRight =
false;
2264 for (
int rightComponentOrdinal=0; rightComponentOrdinal<rightComponentCount; rightComponentOrdinal++)
2267 : basisValuesRight.basisValues().
tensorData(rightFamilyOrdinal);
2268 if (!rightComponent.
isValid())
2271 b_offset += basisValuesRight.
vectorData().numDimsForComponent(rightComponentOrdinal);
2277 const int vectorSize = getVectorSizeForHierarchicalParallelism<Scalar>();
2278 Kokkos::TeamPolicy<ExecutionSpace> policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,Kokkos::AUTO(),vectorSize);
2287 bool forceNonSpecialized =
false;
2288 bool usePointCacheForRank3Tensor =
true;
2292 if (haveLaunchedContributionToCurrentFamilyLeft && haveLaunchedContributionToCurrentFamilyRight)
2294 ExecutionSpace().fence();
2296 haveLaunchedContributionToCurrentFamilyLeft =
true;
2297 haveLaunchedContributionToCurrentFamilyRight =
true;
2299 if (integralViewRank == 2)
2303 auto functor =
Impl::F_IntegratePointValueCache<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2305 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2306 const int teamSize = functor.teamSize(recommendedTeamSize);
2308 policy = Kokkos::TeamPolicy<DeviceType>(cellDataExtent,teamSize,vectorSize);
2310 Kokkos::parallel_for(
"F_IntegratePointValueCache rank 2", policy, functor);
2312 if (approximateFlops != NULL)
2314 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.
getDataExtent(0);
2319 auto functor =
Impl::F_Integrate<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2321 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2322 const int teamSize = functor.teamSize(recommendedTeamSize);
2324 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2326 Kokkos::parallel_for(
"F_Integrate rank 2", policy, functor);
2328 if (approximateFlops != NULL)
2330 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.
getDataExtent(0);
2334 else if (integralViewRank == 3)
2338 auto functor =
Impl::F_IntegratePointValueCache<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2340 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2341 const int teamSize = functor.teamSize(recommendedTeamSize);
2343 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2345 Kokkos::parallel_for(
"F_IntegratePointValueCache rank 3", policy, functor);
2347 if (approximateFlops != NULL)
2349 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.
getDataExtent(0);
2354 auto functor =
Impl::F_Integrate<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2356 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2357 const int teamSize = functor.teamSize(recommendedTeamSize);
2359 policy = Kokkos::TeamPolicy<DeviceType>(cellDataExtent,teamSize,vectorSize);
2361 Kokkos::parallel_for(
"F_Integrate rank 3", policy, functor);
2363 if (approximateFlops != NULL)
2365 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.
getDataExtent(0);
2369 b_offset += isVectorValued ? basisValuesRight.
vectorData().numDimsForComponent(rightComponentOrdinal) : 1;
2371 rightFieldOrdinalOffset += isVectorValued ? basisValuesRight.
vectorData().numFieldsInFamily(rightFamilyOrdinal) : basisValuesRight.basisValues().numFieldsInFamily(rightFamilyOrdinal);
2373 a_offset += isVectorValued ? basisValuesLeft.
vectorData().numDimsForComponent(leftComponentOrdinal) : 1;
2375 leftFieldOrdinalOffset += isVectorValued ? basisValuesLeft.
vectorData().numFieldsInFamily(leftFamilyOrdinal) : basisValuesLeft.basisValues().numFieldsInFamily(leftFamilyOrdinal);
2382 ExecutionSpace().fence();
KOKKOS_INLINE_FUNCTION DimensionInfo combinedDimensionInfo(const DimensionInfo &myData, const DimensionInfo &otherData)
Returns DimensionInfo for a Data container that combines (through multiplication, say,...
@ GENERAL
arbitrary variation
@ BLOCK_PLUS_DIAGONAL
one of two dimensions in a matrix; bottom-right part of matrix is diagonal
Defines the Intrepid2::FunctorIterator class, as well as the Intrepid2::functor_returns_ref SFINAE he...
Defines TensorArgumentIterator, which allows systematic enumeration of a TensorData object.
KOKKOS_INLINE_FUNCTION constexpr std::enable_if< std::is_pod< T >::value, unsigned >::type dimension_scalar(const Kokkos::DynRankView< T, P... >)
specialization of functions for pod types, returning the scalar dimension (1 for pod types) of a view...
#define INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(test, x, msg)
KOKKOS_INLINE_FUNCTION int familyFieldOrdinalOffset(const int &familyOrdinal) const
Returns the field ordinal offset for the specified family.
KOKKOS_INLINE_FUNCTION int numFamilies() const
For valid vectorData, returns the number of families in vectorData; otherwise, returns number of Tens...
TensorDataType & tensorData()
TensorData accessor for single-family scalar data.
Wrapper around a Kokkos::View that allows data that is constant or repeating in various logical dimen...
void storeInPlaceProduct(const Data< DataScalar, DeviceType > &A, const Data< DataScalar, DeviceType > &B)
stores the in-place (entrywise) product, A .* B, into this container.
static Data< DataScalar, DeviceType > allocateMatMatResult(const bool transposeA, const Data< DataScalar, DeviceType > &A_MatData, const bool transposeB, const Data< DataScalar, DeviceType > &B_MatData)
KOKKOS_INLINE_FUNCTION enable_if_t< rank==1, const Kokkos::View< typename RankExpander< DataScalar, rank >::value_type, DeviceType > & > getUnderlyingView() const
Returns the underlying view. Throws an exception if the underlying view is not rank 1.
KOKKOS_INLINE_FUNCTION int extent_int(const int &r) const
Returns the logical extent in the specified dimension.
KOKKOS_INLINE_FUNCTION const Kokkos::View< DataScalar **, DeviceType > & getUnderlyingView2() const
returns the View that stores the unique data. For rank-2 underlying containers.
KOKKOS_INLINE_FUNCTION const Kokkos::View< DataScalar ****, DeviceType > & getUnderlyingView4() const
returns the View that stores the unique data. For rank-4 underlying containers.
KOKKOS_INLINE_FUNCTION constexpr bool isValid() const
returns true for containers that have data; false for those that don't (namely, those that have been ...
KOKKOS_INLINE_FUNCTION const Kokkos::View< DataScalar ***, DeviceType > & getUnderlyingView3() const
returns the View that stores the unique data. For rank-3 underlying containers.
void clear() const
Copies 0.0 to the underlying View.
KOKKOS_INLINE_FUNCTION ordinal_type getUnderlyingViewSize() const
returns the number of entries in the View that stores the unique data
static Data< DataScalar, DeviceType > allocateInPlaceCombinationResult(const Data< DataScalar, DeviceType > &A, const Data< DataScalar, DeviceType > &B)
KOKKOS_INLINE_FUNCTION const Kokkos::Array< DataVariationType, 7 > & getVariationTypes() const
Returns an array with the variation types in each logical dimension.
KOKKOS_INLINE_FUNCTION int getDataExtent(const ordinal_type &d) const
returns the true extent of the data corresponding to the logical dimension provided; if the data does...
void storeMatMat(const bool transposeA, const Data< DataScalar, DeviceType > &A_MatData, const bool transposeB, const Data< DataScalar, DeviceType > &B_MatData)
KOKKOS_INLINE_FUNCTION Kokkos::Array< int, 7 > getExtents() const
Returns an array containing the logical extents in each dimension.
KOKKOS_INLINE_FUNCTION bool underlyingMatchesLogical() const
Returns true if the underlying container has exactly the same rank and extents as the logical contain...
KOKKOS_INLINE_FUNCTION ordinal_type getUnderlyingViewRank() const
returns the rank of the View that stores the unique data
KOKKOS_INLINE_FUNCTION DimensionInfo getDimensionInfo(const int &dim) const
Returns an object fully specifying the indicated dimension. This is used in determining appropriate s...
Data shallowCopy(const int rank, const Kokkos::Array< int, 7 > &extents, const Kokkos::Array< DataVariationType, 7 > &variationTypes)
Creates a new Data object with the same underlying view, but with the specified logical rank,...
KOKKOS_INLINE_FUNCTION unsigned rank() const
Returns the logical rank of the Data container.
Implementation of a general sum factorization algorithm, using a novel approach developed by Roberts,...
KOKKOS_INLINE_FUNCTION int incrementArgument(Kokkos::Array< int, Parameters::MaxTensorComponents > &arguments, const Kokkos::Array< int, Parameters::MaxTensorComponents > &bounds, const int &numComponents) const
runtime-sized variant of incrementArgument; gets used by approximate flop count.
KOKKOS_INLINE_FUNCTION int nextIncrementResult(const Kokkos::Array< int, Parameters::MaxTensorComponents > &arguments, const Kokkos::Array< int, Parameters::MaxTensorComponents > &bounds, const int &numComponents) const
runtime-sized variant of nextIncrementResult; gets used by approximate flop count.
KOKKOS_INLINE_FUNCTION void runSpecialized3(const TeamMember &teamMember) const
Hand-coded 3-component version.
long approximateFlopCountPerCell() const
returns an estimate of the number of floating point operations per cell (counting sums,...
size_t team_shmem_size(int numThreads) const
Provide the shared memory capacity.
int teamSize(const int &maxTeamSizeFromKokkos) const
returns the team size that should be provided to the policy constructor, based on the Kokkos maximum ...
Implementation of a general sum factorization algorithm, abstracted from the algorithm described by M...
KOKKOS_INLINE_FUNCTION int incrementArgument(Kokkos::Array< int, Parameters::MaxTensorComponents > &arguments, const Kokkos::Array< int, Parameters::MaxTensorComponents > &bounds, const int &numComponents) const
runtime-sized variant of incrementArgument; gets used by approximate flop count.
size_t team_shmem_size(int team_size) const
Provide the shared memory capacity.
KOKKOS_INLINE_FUNCTION int nextIncrementResult(const Kokkos::Array< int, Parameters::MaxTensorComponents > &arguments, const Kokkos::Array< int, Parameters::MaxTensorComponents > &bounds, const int &numComponents) const
runtime-sized variant of nextIncrementResult; gets used by approximate flop count.
int teamSize(const int &maxTeamSizeFromKokkos) const
returns the team size that should be provided to the policy constructor, based on the Kokkos maximum ...
KOKKOS_INLINE_FUNCTION void runSpecialized3(const TeamMember &teamMember) const
runSpecialized implementations are hand-coded variants of run() for a particular number of components...
long approximateFlopCountPerCell() const
returns an estimate of the number of floating point operations per cell (counting sums,...
Allows systematic enumeration of all entries in a TensorData object, tracking indices for each tensor...
KOKKOS_INLINE_FUNCTION void setEnumerationIndex(const ordinal_type &enumerationIndex)
Sets the enumeration index; this refers to a 1D enumeration of the possible in-bound arguments.
KOKKOS_INLINE_FUNCTION const ordinal_type & argument(const ordinal_type &r) const
View-like interface to tensor data; tensor components are stored separately and multiplied together a...
KOKKOS_INLINE_FUNCTION const Data< Scalar, DeviceType > & getTensorComponent(const ordinal_type &r) const
Returns the requested tensor component.
KOKKOS_INLINE_FUNCTION constexpr bool isValid() const
Returns true for containers that have data; false for those that don't (e.g., those that have been co...
KOKKOS_INLINE_FUNCTION std::enable_if< std::is_integral< iType >::value, ordinal_type >::type extent_int(const iType &d) const
Returns the logical extent in the requested dimension.
KOKKOS_INLINE_FUNCTION ordinal_type numTensorComponents() const
Return the number of tensorial components.
Struct expressing all variation information about a Data object in a single dimension,...