#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/TypeSize.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "polly-opt-isl"
55 "polly-target-latency-vector-fma",
56 cl::desc(
"The minimal number of cycles between issuing two "
57 "dependent consecutive vector fused multiply-add "
62 "polly-target-throughput-vector-fma",
63 cl::desc(
"A throughput of the processor floating-point arithmetic units "
64 "expressed in the number of vector fused multiply-add "
65 "instructions per clock cycle."),
69 "polly-target-1st-cache-level-size",
70 cl::desc(
"The size of the first cache level specified in bytes."),
74 "polly-target-1st-cache-level-default-size",
75 cl::desc(
"The default size of the first cache level specified in bytes"
76 " (if not enough were provided by the TargetTransformInfo)."),
80 "polly-target-2nd-cache-level-size",
81 cl::desc(
"The size of the second level specified in bytes."), cl::Hidden,
85 "polly-target-2nd-cache-level-default-size",
86 cl::desc(
"The default size of the second cache level specified in bytes"
87 " (if not enough were provided by the TargetTransformInfo)."),
98 "polly-target-1st-cache-level-associativity",
99 cl::desc(
"The associativity of the first cache level."), cl::Hidden,
103 "polly-target-1st-cache-level-default-associativity",
104 cl::desc(
"The default associativity of the first cache level"
105 " (if not enough were provided by the TargetTransformInfo)."),
109 "polly-target-2nd-cache-level-associativity",
110 cl::desc(
"The associativity of the second cache level."), cl::Hidden,
114 "polly-target-2nd-cache-level-default-associativity",
115 cl::desc(
"The default associativity of the second cache level"
116 " (if not enough were provided by the TargetTransformInfo)."),
120 "polly-target-vector-register-bitwidth",
121 cl::desc(
"The size in bits of a vector register (if not set, this "
122 "information is taken from LLVM's target information."),
126 "polly-pattern-matching-nc-quotient",
127 cl::desc(
"Quotient that is obtained by dividing Nc, the parameter of the"
128 "macro-kernel, by Nr, the parameter of the micro-kernel"),
133 cl::desc(
"Perform optimizations of tensor contractions based "
134 "on pattern matching"),
139 cl::desc(
"Perform optimizations of matrix multiplications "
140 "based on pattern matching"),
144 "polly-tc-dependences-computeout",
145 cl::desc(
"Bound the dependence analysis by a maximal amount of "
146 "computational steps (0 means no bound)"),
147 cl::Hidden, cl::init(500000), cl::ZeroOrMore, cl::cat(
PollyCategory));
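// Usage sketch (not part of the original source): all of the knobs above are
// ordinary cl::opt flags, so the cache and vector-unit model can be overridden
// on the command line when TargetTransformInfo reports nothing useful, e.g. in
// an opt invocation with Polly linked in (the values below are illustrative):
//
//   opt ... -polly-target-vector-register-bitwidth=512 \
//           -polly-target-1st-cache-level-size=32768 \
//           -polly-target-2nd-cache-level-size=1048576 \
//           -polly-target-latency-vector-fma=4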
/// Parameters of the BLIS micro kernel (register blocking).
struct MicroKernelParamsTy {
  int Mr;
  int Nr;
};

/// Parameters of the BLIS macro kernel (cache blocking).
struct MacroKernelParamsTy {
  int Mc;
  int Nc;
  int Kc;
};
/// Parameters of the detected matrix multiplication.
struct MatMulInfoTy {
  MemoryAccess *A = nullptr;
  MemoryAccess *B = nullptr;
  MemoryAccess *ReadFromC = nullptr;
  MemoryAccess *WriteToC = nullptr;
  int i = -1;
  int j = -1;
  int k = -1;
};

/// Parameters of the detected tensor contraction.
struct TCInfoTy {
  MemoryAccess *A = nullptr;
  MemoryAccess *B = nullptr;
  MemoryAccess *ReadFromC = nullptr;
  MemoryAccess *WriteToC = nullptr;
  SmallDenseSet<int> I;
  SmallDenseSet<int> J;
  SmallDenseSet<int> P;
  SmallVector<int> DimensionSizes;
  SmallVector<int> ADimensions;
  SmallVector<int> BDimensions;
  SmallVector<int> CDimensions;
  SmallVector<int> OrderedI;
  SmallVector<int> OrderedJ;
  SmallVector<int> OrderedP;
};
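// Example (illustrative only): for the contraction C[i][j] += A[i][p][q] *
// B[p][q][j], the index bundles are I = {i}, J = {j} and P = {p, q}; A is
// subscripted by I and P, B by P and J, and C by I and J, which is exactly
// the shape that setReadAccess and containsOnlyTCAcc below check for.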
static isl::union_set getUnrollIsolatedSetOptions(isl::ctx Ctx) {
  isl::space Space = isl::space(Ctx, 0, 0, 1);
  isl::map UnrollIsolatedSetOption = isl::map::universe(Space);
  isl::id DimInId = isl::id::alloc(Ctx, "isolate", nullptr);
  isl::id DimOutId = isl::id::alloc(Ctx, "unroll", nullptr);
  UnrollIsolatedSetOption =
      UnrollIsolatedSetOption.set_tuple_id(isl::dim::in, DimInId);
  UnrollIsolatedSetOption =
      UnrollIsolatedSetOption.set_tuple_id(isl::dim::out, DimOutId);
  return UnrollIsolatedSetOption.wrap();
}
static isl::map permuteDimensions(isl::map Map, isl::dim DimType,
                                  unsigned DstPos, unsigned SrcPos) {
  if (DstPos == SrcPos)
    return Map;
  // ... (remember the tuple ids and pick the opposite dimension type as a
  // temporary "free" location)
  auto MaxDim = std::max(DstPos, SrcPos);
  auto MinDim = std::min(DstPos, SrcPos);
  Map = Map.move_dims(FreeDim, 0, DimType, MaxDim, 1);
  Map = Map.move_dims(FreeDim, 0, DimType, MinDim, 1);
  Map = Map.move_dims(DimType, MinDim, FreeDim, 1, 1);
  Map = Map.move_dims(DimType, MaxDim, FreeDim, 0, 1);
  // ... (restore the tuple ids)
  return Map;
}
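// Worked example (assumption, not from the original file): with
// DimType == isl::dim::out, DstPos == 0 and SrcPos == 2, the two output
// dimensions are parked in the input space and moved back in exchanged
// order, so { S[i, j, k] -> [i, j, k] } becomes { S[i, j, k] -> [k, j, i] }.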
static bool isMatMulOperandAcc(isl::set Domain, isl::map AccMap, int &FirstPos,
                               int &SecondPos) {
  // ...
  int FirstDims[] = {0, 0, 1, 1, 2, 2};
  int SecondDims[] = {1, 2, 2, 0, 0, 1};
  for (int i = 0; i < 6; i += 1) {
    auto PossibleMatMul =
        /* ... a candidate access relation that equates input dimension
           FirstDims[i] with output dimension 0 and input dimension
           SecondDims[i] with output dimension 1 ... */;
    if (AccMap.is_equal(PossibleMatMul)) {
      if (FirstPos != -1 && FirstPos != FirstDims[i])
        continue;
      FirstPos = FirstDims[i];
      if (SecondPos != -1 && SecondPos != SecondDims[i])
        continue;
      SecondPos = SecondDims[i];
      return true;
    }
  }
  return false;
}
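// For illustration (assumption): in the canonical nest of
// C[i][j] += A[i][k] * B[k][j] with loop dimensions (i, j, k) = (0, 1, 2),
// the candidate pairs above match C via (0, 1), A via (0, 2) and B via
// (2, 1), i.e. each operand is a two-dimensional array subscripted by
// exactly two of the three loop dimensions.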
static bool isMatMulNonScalarReadAccess(MemoryAccess *MemAccess,
                                        MatMulInfoTy &MMI) {
  // ...
  if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.i, MMI.j) && !MMI.ReadFromC) {
    MMI.ReadFromC = MemAccess; // C is read with subscripts (i, j).
    return true;
  }
  if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.i, MMI.k) && !MMI.A) {
    MMI.A = MemAccess; // A is read with subscripts (i, k).
    return true;
  }
  if (isMatMulOperandAcc(StmtDomain, AccMap, MMI.k, MMI.j) && !MMI.B) {
    MMI.B = MemAccess; // B is read with subscripts (k, j).
    return true;
  }
  return false;
}
static bool containsOnlyMatrMultAcc(isl::map PartialSchedule,
                                    MatMulInfoTy &MMI) {
  // ...
  auto *Stmt = static_cast<ScopStmt *>(InputDimId.get_user());
  // ...
  assert(OutDimNum > 2 && "In case of the matrix multiplication the loop nest "
                          "and, consequently, the corresponding scheduling "
                          "functions have at least three dimensions.");
  auto MapI =
      permuteDimensions(PartialSchedule, isl::dim::out, MMI.i, OutDimNum - 1);
  auto MapJ =
      permuteDimensions(PartialSchedule, isl::dim::out, MMI.j, OutDimNum - 1);
  auto MapK =
      permuteDimensions(PartialSchedule, isl::dim::out, MMI.k, OutDimNum - 1);
  // ...
  for (auto *MemA = Accesses.begin(); MemA != Accesses.end() - 1; MemA++) {
    auto *MemAccessPtr = *MemA;
    if (MemAccessPtr->isLatestArrayKind() && MemAccessPtr != MMI.WriteToC &&
        !isMatMulNonScalarReadAccess(MemAccessPtr, MMI) &&
        !(MemAccessPtr->isStrideZero(MapI) &&
          MemAccessPtr->isStrideZero(MapJ) && MemAccessPtr->isStrideZero(MapK)))
      return false;
  }
  return true;
}
static bool containsOnlyMatMulDep(isl::map Schedule, const Dependences *D,
                                  int &Pos) {
  // ...
  Dep = Dep.unite(Red);
  // ...
  for (int i = 0; i < DeltasDimNum; i++) {
    // ...
    Pos = Pos < 0 && Val.is_one() ? i : Pos;
    if (Val.is_nan() || !(Val.is_zero() || (i == Pos && Val.is_one())))
      return false;
  }
  if (DeltasDimNum == 0 || Pos < 0)
    return false;
  return true;
}
static bool containsMatrMult(isl::map PartialSchedule, const Dependences *D,
                             MatMulInfoTy &MMI) {
  // ...
  auto *Stmt = static_cast<ScopStmt *>(InputDimsId.get_user());
  if (Stmt->size() <= 1)
    return false;
  // ...
  for (auto *MemA = Accesses.end() - 1; MemA != Accesses.begin(); MemA--) {
    auto *MemAccessPtr = *MemA;
    if (!MemAccessPtr->isLatestArrayKind())
      continue;
    if (!MemAccessPtr->isWrite())
      return false;
    auto AccMap = MemAccessPtr->getLatestAccessRelation();
    if (!isMatMulOperandAcc(Stmt->getDomain(), AccMap, MMI.i, MMI.j))
      return false;
    MMI.WriteToC = MemAccessPtr;
    break;
  }
  if (!containsOnlyMatMulDep(PartialSchedule, D, MMI.k))
    return false;
  if (!MMI.WriteToC || !containsOnlyMatrMultAcc(PartialSchedule, MMI))
    return false;
  if (!MMI.A || !MMI.B || !MMI.ReadFromC)
    return false;
  return true;
}
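// Summary of the checks above (a gloss, not original source text): a
// statement is treated as a GEMM candidate only if its last array write has
// the form C[i][j], all carried dependences step by one over a single
// dimension k, every remaining array access is A[i][k], B[k][j], C[i][j] or
// has zero stride in i, j and k, and all three operands plus the read of C
// were found.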
static isl::schedule_node permuteBandNodeDimensions(isl::schedule_node Node,
                                                    unsigned FirstDim,
                                                    unsigned SecondDim) {
  // ... (the band must have more members than std::max(FirstDim, SecondDim))
  auto PartialSchedule =
      isl::manage(isl_schedule_node_band_get_partial_schedule(Node.get()));
  auto PartialScheduleFirstDim = PartialSchedule.at(FirstDim);
  auto PartialScheduleSecondDim = PartialSchedule.at(SecondDim);
  PartialSchedule =
      PartialSchedule.set_union_pw_aff(SecondDim, PartialScheduleFirstDim);
  PartialSchedule =
      PartialSchedule.set_union_pw_aff(FirstDim, PartialScheduleSecondDim);
  // ... (replace the old band with one built from the permuted schedule)
}

static isl::schedule_node
createMicroKernel(isl::schedule_node Node,
                  MicroKernelParamsTy MicroKernelParams) {
  Node = applyRegisterTiling(Node, {MicroKernelParams.Mr, MicroKernelParams.Nr},
                             1);
  Node = Node.parent().parent();
  return permuteBandNodeDimensions(Node, 0, 1).child(0).child(0);
}
static isl::schedule_node
createMacroKernel(isl::schedule_node Node,
                  MacroKernelParamsTy MacroKernelParams) {
  // ...
  if (MacroKernelParams.Mc == 1 && MacroKernelParams.Nc == 1 &&
      MacroKernelParams.Kc == 1)
    return Node;
  // ...
  std::vector<int> TileSizes(DimOutNum, 1);
  TileSizes[DimOutNum - 3] = MacroKernelParams.Mc;
  TileSizes[DimOutNum - 2] = MacroKernelParams.Nc;
  TileSizes[DimOutNum - 1] = MacroKernelParams.Kc;
  Node = tileNode(Node, "1st level tiling", TileSizes, 1);
  // ...
  Node = permuteBandNodeDimensions(Node, DimOutNum - 2, DimOutNum - 1);
  Node = permuteBandNodeDimensions(Node, DimOutNum - 3, DimOutNum - 1);
  // ...
}
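// Worked example (values are hypothetical): for a three-dimensional band and
// MacroKernelParams {Mc = 96, Nc = 2048, Kc = 256}, tileNode introduces the
// tile loops in the order (Mc, Nc, Kc); the two permutations above then
// reorder them to (Nc, Kc, Mc), i.e. the jc -> pc -> ic loop order of the
// BLIS macro kernel.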
static uint64_t getMatMulAlignTypeSize(const MatMulInfoTy &MMI) {
  auto *S = MMI.A->getStatement()->getParent();
  auto &DL = S->getFunction().getParent()->getDataLayout();
  auto ElementSizeA = DL.getTypeAllocSize(MMI.A->getElementType());
  auto ElementSizeB = DL.getTypeAllocSize(MMI.B->getElementType());
  auto ElementSizeC = DL.getTypeAllocSize(MMI.WriteToC->getElementType());
  return std::max({ElementSizeA, ElementSizeB, ElementSizeC});
}

static uint64_t getMatMulTypeSize(const MatMulInfoTy &MMI) {
  auto *S = MMI.A->getStatement()->getParent();
  auto &DL = S->getFunction().getParent()->getDataLayout();
  auto ElementSizeA = DL.getTypeSizeInBits(MMI.A->getElementType());
  auto ElementSizeB = DL.getTypeSizeInBits(MMI.B->getElementType());
  auto ElementSizeC = DL.getTypeSizeInBits(MMI.WriteToC->getElementType());
  return std::max({ElementSizeA, ElementSizeB, ElementSizeC});
}
static MicroKernelParamsTy getMicroKernelParams(const TargetTransformInfo *TTI,
                                                const MatMulInfoTy &MMI) {
  assert(TTI && "The target transform info should be provided.");
  long RegisterBitwidth = VectorRegisterBitwidth;
  if (RegisterBitwidth == -1)
    RegisterBitwidth =
        TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
  auto ElementSize = getMatMulTypeSize(MMI);
  assert(ElementSize > 0 && "The element size of the matrix multiplication "
                            "operands should be greater than zero.");
  auto Nvec = RegisterBitwidth / ElementSize;
  // ... (derive Mr and Nr from Nvec, LatencyVectorFma and ThroughputVectorFma)
}
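// Sizing rationale (a hedged sketch of the analytical model, not verbatim
// from this excerpt): to keep the FMA pipeline saturated, the micro kernel
// must keep roughly Nvec * LatencyVectorFma * ThroughputVectorFma independent
// accumulators of C live in registers, so Mr and Nr are chosen such that
// Mr * Nr >= Nvec * LatencyVectorFma * ThroughputVectorFma. For example,
// with 256-bit vectors and 64-bit elements Nvec = 4, and the default latency
// of 8 and throughput of 1 require at least 32 register accumulators.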
static void getTargetCacheParameters(const llvm::TargetTransformInfo *TTI) {
  auto L1DCache = llvm::TargetTransformInfo::CacheLevel::L1D;
  auto L2DCache = llvm::TargetTransformInfo::CacheLevel::L2D;
  // ...
  if (TTI->getCacheSize(L1DCache))
    FirstCacheLevelSize = TTI->getCacheSize(L1DCache).value();
  // ...
  if (TTI->getCacheSize(L2DCache))
    SecondCacheLevelSize = TTI->getCacheSize(L2DCache).value();
  // ...
  if (TTI->getCacheAssociativity(L1DCache))
    FirstCacheLevelAssociativity =
        TTI->getCacheAssociativity(L1DCache).value();
  // ...
  if (TTI->getCacheAssociativity(L2DCache))
    SecondCacheLevelAssociativity =
        TTI->getCacheAssociativity(L2DCache).value();
  // ... (fall back to the *Default* flags when nothing valid was obtained)
}
static MacroKernelParamsTy
getMacroKernelParams(const llvm::TargetTransformInfo *TTI,
                     const MicroKernelParamsTy &MicroKernelParams,
                     const MatMulInfoTy &MMI) {
  getTargetCacheParameters(TTI);
  // ...
  if (!(MicroKernelParams.Mr > 0 && MicroKernelParams.Nr > 0 &&
        FirstCacheLevelSize > 0 && SecondCacheLevelSize > 0))
    return {1, 1, 1};
  // ...
  int Car = floor(
      (FirstCacheLevelAssociativity - 1) /
      (1 + static_cast<double>(MicroKernelParams.Nr) / MicroKernelParams.Mr));
  // ...
  auto ElementSize = getMatMulAlignTypeSize(MMI);
  assert(ElementSize > 0 && "The element size of the matrix multiplication "
                            "operands should be greater than zero.");
  // ...
  assert(Mc > 0 && Nc > 0 && Kc > 0 &&
         "Matrix block sizes should be greater than zero");
  return {Mc, Nc, Kc};
}
static isl::map getMatMulAccRel(isl::map MapOldIndVar, unsigned FirstDim,
                                unsigned SecondDim) {
  auto AccessRelSpace = isl::space(MapOldIndVar.ctx(), 0, 9, 3);
  // ... (equate the chosen input dimensions of the nine-dimensional schedule
  // with the three dimensions of the packed array and apply the result)
}

static isl::schedule_node createExtensionNode(isl::schedule_node Node,
                                              isl::map ExtensionMap) {
  auto Extension = isl::union_map(ExtensionMap);
  auto NewNode = isl::schedule_node::from_extension(Extension);
  return Node.graft_before(NewNode);
}
static isl::schedule_node optimizePackedB(isl::schedule_node Node,
                                          ScopStmt *Stmt, isl::map MapOldIndVar,
                                          MicroKernelParamsTy MicroParams,
                                          MacroKernelParamsTy MacroParams,
                                          MatMulInfoTy &MMI) {
  Scop *S = Stmt->getParent();
  // ...
  // The packed copy of B has the shape (Nc / Nr) x Kc x Nr.
  unsigned FirstDimSize = MacroParams.Nc / MicroParams.Nr;
  unsigned SecondDimSize = MacroParams.Kc;
  unsigned ThirdDimSize = MicroParams.Nr;
  ScopArrayInfo *PackedB =
      S->createScopArrayInfo(MMI.B->getElementType(), "Packed_B",
                             {FirstDimSize, SecondDimSize, ThirdDimSize});
  // ...
  isl::map AccRelPackedB = getMatMulAccRel(MapOldIndVar, 3, 7);
  // ...
  ScopStmt *CopyStmt = S->addScopStmt(AccRelB, AccRelPackedB, Domain);
  // ...
  return createExtensionNode(Node, ExtMap);
}
static isl::schedule_node optimizePackedA(isl::schedule_node Node, ScopStmt *,
                                          isl::map MapOldIndVar,
                                          MicroKernelParamsTy MicroParams,
                                          MacroKernelParamsTy MacroParams,
                                          MatMulInfoTy &MMI) {
  // ...
  ScopStmt *Stmt = static_cast<ScopStmt *>(InputDimsId.get_user());
  // ...
  isl::id DomainId = Domain.get_tuple_id();
  // ...
  // The packed copy of A has the shape (Mc / Mr) x Kc x Mr.
  unsigned FirstDimSize = MacroParams.Mc / MicroParams.Mr;
  unsigned SecondDimSize = MacroParams.Kc;
  unsigned ThirdDimSize = MicroParams.Mr;
  ScopArrayInfo *PackedA =
      S->createScopArrayInfo(MMI.A->getElementType(), "Packed_A",
                             {FirstDimSize, SecondDimSize, ThirdDimSize});
  // ...
  isl::map AccRelPackedA = getMatMulAccRel(MapOldIndVar, 4, 6);
  // ...
  isl::map PackedATranslator = AccRelPackedA.apply_domain(AccRelA);
  // ...
  isl::map OuterDomainMap = /* ... */;
  // ...
  isl::map DomainTranslator = OuterDomainMap.range_product(CopyFrom);
  // ...
  isl::map CopyTo = CopyFrom.apply_range(PackedATranslator);
  // ...
  return createExtensionNode(Node, ExtScatterCopy);
}
static isl::schedule_node
optimizeDataLayoutMatrMulPattern(isl::schedule_node Node, isl::map MapOldIndVar,
                                 MicroKernelParamsTy MicroParams,
                                 MacroKernelParamsTy MacroParams,
                                 MatMulInfoTy &MMI) {
  // ...
  ScopStmt *Stmt = static_cast<ScopStmt *>(InputDimsId.get_user());
  // ...
  Node = Node.child(0);
  Node =
      optimizePackedB(Node, Stmt, MapOldIndVar, MicroParams, MacroParams, MMI);
  // ...
  Node = Node.child(0);
  Node =
      optimizePackedA(Node, Stmt, MapOldIndVar, MicroParams, MacroParams, MMI);
  // ...
}
static isl::map
getInductionVariablesSubstitution(isl::schedule_node Node,
                                  MicroKernelParamsTy MicroKernelParams,
                                  MacroKernelParamsTy MacroKernelParams) {
  auto Child = Node.child(0);
  // ...
}
static isl::schedule_node
isolateAndUnrollMatMulInnerLoops(isl::schedule_node Node,
                                 MicroKernelParamsTy MicroKernelParams) {
  isl::schedule_node Child = Node.child(0);
  // ...
  isl::union_set IsolateOption = /* ... */;
  isl::ctx Ctx = Node.ctx();
  // ...
  Options = Options.unite(getUnrollIsolatedSetOptions(Ctx));
  Node = Node.as<isl::schedule_node_band>().set_ast_build_options(Options);
  // ...
  Node = Node.as<isl::schedule_node_band>().set_ast_build_options(Options);
  // ...
}
static isl::schedule_node markLoopVectorizerDisabled(isl::schedule_node Node) {
  // ...
}

static isl::schedule_node
getBandNodeWithOriginDimOrder(isl::schedule_node Node) {
  // ...
  auto PartialSchedulePwAff = Domain.identity_union_pw_multi_aff();
  auto PartialScheduleMultiPwAff =
      isl::multi_union_pw_aff(PartialSchedulePwAff);
  PartialScheduleMultiPwAff = /* ... */;
  // ...
}
static isl::schedule_node optimizeMatMulPattern(isl::schedule_node Node,
                                                const TargetTransformInfo *TTI,
                                                MatMulInfoTy &MMI) {
  assert(TTI && "The target transform info should be provided.");
  int DimOutNum = isl_schedule_node_band_n_member(Node.get());
  assert(DimOutNum > 2 && "In case of the matrix multiplication the loop nest "
                          "and, consequently, the corresponding scheduling "
                          "functions have at least three dimensions.");
  Node = getBandNodeWithOriginDimOrder(Node);
  Node = permuteBandNodeDimensions(Node, MMI.i, DimOutNum - 3);
  int NewJ = MMI.j == DimOutNum - 3 ? MMI.i : MMI.j;
  int NewK = MMI.k == DimOutNum - 3 ? MMI.i : MMI.k;
  Node = permuteBandNodeDimensions(Node, NewJ, DimOutNum - 2);
  NewK = NewK == DimOutNum - 2 ? NewJ : NewK;
  Node = permuteBandNodeDimensions(Node, NewK, DimOutNum - 1);
  auto MicroKernelParams = getMicroKernelParams(TTI, MMI);
  auto MacroKernelParams = getMacroKernelParams(TTI, MicroKernelParams, MMI);
  Node = createMacroKernel(Node, MacroKernelParams);
  Node = createMicroKernel(Node, MicroKernelParams);
  if (MacroKernelParams.Mc == 1 || MacroKernelParams.Nc == 1 ||
      MacroKernelParams.Kc == 1)
    return Node;
  auto MapOldIndVar = getInductionVariablesSubstitution(Node, MicroKernelParams,
                                                        MacroKernelParams);
  // ...
  Node = markLoopVectorizerDisabled(Node.parent()).child(0);
  Node = isolateAndUnrollMatMulInnerLoops(Node, MicroKernelParams);
  return optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroKernelParams,
                                          MacroKernelParams, MMI);
}
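// Resulting loop structure (informal sketch of the BLIS decomposition this
// function builds; subscripts are abbreviated and block sizes hypothetical):
//   for (jc = 0; jc < N; jc += Nc)          // macro kernel
//     for (pc = 0; pc < K; pc += Kc)        //   pack B -> Packed_B
//       for (ic = 0; ic < M; ic += Mc)      //   pack A -> Packed_A
//         for (jr = 0; jr < Nc; jr += Nr)   // micro kernel
//           for (ir = 0; ir < Mc; ir += Mr)
//             for (k = 0; k < Kc; k++)
//               C[ic+ir .. ic+ir+Mr][jc+jr .. jc+jr+Nr] += ...;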
static bool isMatrMultPattern(isl::schedule_node Node, const Dependences *D,
                              MatMulInfoTy &MMI) {
  // ...
  if (containsMatrMult(NewPartialSchedule, D, MMI))
    return true;
  return false;
}
static int getDimSize(const ScopArrayInfo *SAI, unsigned Pos) {
  // ...
  const SCEV *SCEVDimSize = SAI->getDimensionSize(Pos);
  // ...
  auto *ConstantDimSize = dyn_cast<const SCEVConstant>(SCEVDimSize);
  // ...
  auto *IntDimSize = dyn_cast<ConstantInt>(ConstantDimSize->getValue());
  // ...
  return IntDimSize->getSExtValue();
}
static bool isCorrectAccessMap(isl::set Domain, isl::map AccMap,
                               ArrayRef<int> Dimensions) {
  // ...
  for (unsigned i = 0; i < Dimensions.size(); i++) {
    const int InPos = Dimensions[i];
    if ((InPos >= static_cast<int>(DimInSize)) || (InPos < 0))
      return false;
    // ...
  }
  // ...
  return AccMap.is_equal(PossibleTensor);
}
static bool isTCOperandAcc(isl::set Domain, isl::map AccMap,
                           SmallDenseSet<int> &IndexSet,
                           SmallVectorImpl<int> &DimensionSizes,
                           SmallVectorImpl<int> &Dimensions) {
  // ...
  assert(SAI && "AccMap should represent memory access");
  // ...
  for (unsigned i = 0; i < OutDimNum; i++) {
    // ...
  }
  Dimensions.assign(OutDimNum, -1);
  // ... (inside a loop over the input dimensions of AccMap)
    if (ValAPInt.isSignedIntN(32))
      OutPos = ValAPInt.getSExtValue();
    if ((OutPos < 0) || (OutPos >= static_cast<int>(OutDimNum)) ||
        /* ... */)
      return false;
    Dimensions[OutPos] = i;
    if (DimensionSizes[i] <= 0)
      DimensionSizes[i] = getDimSize(SAI, OutPos);
  // ...
  return isCorrectAccessMap(Domain, AccMap, Dimensions);
}
static SmallDenseSet<int> intersect(const SmallDenseSet<int> &A,
                                    const SmallDenseSet<int> &B) {
  SmallDenseSet<int> Intersection = A;
  set_intersect(Intersection, B);
  return Intersection;
}

static bool isSuperset(const SmallDenseSet<int> &A,
                       const SmallDenseSet<int> &B) {
  return intersect(A, B).size() == B.size();
}

static SmallDenseSet<int> unite(const SmallDenseSet<int> &A,
                                const SmallDenseSet<int> &B) {
  SmallDenseSet<int> Union = A;
  set_union(Union, B);
  return Union;
}
static MemoryAccess *getWriteAccess(isl::set Domain, ScopStmt *Stmt,
                                    TCInfoTy &TCI,
                                    SmallDenseSet<int> &IandJIndexSet) {
  TCI.WriteToC = nullptr;
  // ...
  for (MemoryAccess *MemA : reverse(Accesses)) {
    // ...
    if (!MemA->isLatestArrayKind())
      continue;
    // ...
    if (!MemA->isWrite())
      return nullptr;
    // ...
    isl::map AccMap = MemA->getLatestAccessRelation();
    if (!isTCOperandAcc(Domain, AccMap, IandJIndexSet, TCI.DimensionSizes,
                        TCI.CDimensions))
      return nullptr;
    return MemA;
  }
  return nullptr;
}
static bool setReadAccess(MemoryAccess *MemAccessPtr,
                          const SmallDenseSet<int> &IndexSet,
                          const SmallDenseSet<int> &IandJIndexSet,
                          ArrayRef<int> Dimensions, TCInfoTy &TCI) {
  // ... (the first matching read becomes A, the second becomes B)
  if (!isSuperset(IndexSet, TCI.P))
    return false;
  // ...
  TCI.I = set_difference(IndexSet, TCI.P);
  if (!isSuperset(IandJIndexSet, TCI.I))
    return false;
  // ...
  TCI.A = MemAccessPtr;
  llvm::replace(TCI.ADimensions, TCI.ADimensions.begin(),
                TCI.ADimensions.end(), Dimensions.begin(), Dimensions.end());
  // ...
  if (unite(TCI.P, TCI.J) != IndexSet)
    return false;
  // ...
  TCI.B = MemAccessPtr;
  llvm::replace(TCI.BDimensions, TCI.BDimensions.begin(),
                TCI.BDimensions.end(), Dimensions.begin(), Dimensions.end());
  return true;
}
static bool setReadAccesses(isl::set Domain, ScopStmt *Stmt, TCInfoTy &TCI,
                            SmallDenseSet<int> &IandJIndexSet) {
  // ...
  TCI.ReadFromC = nullptr;
  // ...
  for (auto *MemA = Accesses.begin(); *MemA != TCI.WriteToC; MemA++) {
    MemoryAccess *MemAccessPtr = *MemA;
    // ...
    TCI.ReadFromC = MemAccessPtr;
    // ...
    SmallVector<int> Dimensions;
    SmallDenseSet<int> IndexSet;
    if (!isTCOperandAcc(Domain, AccMap, IndexSet, TCI.DimensionSizes,
                        Dimensions))
      return false;
    // ...
    if (!setReadAccess(MemAccessPtr, IndexSet, IandJIndexSet, Dimensions, TCI))
      return false;
  }
  // ...
  return TCI.ReadFromC && TCI.A && TCI.B;
}
static bool containsOnlyTCAcc(isl::set Domain, isl::map PartialSchedule,
                              TCInfoTy &TCI) {
  // ...
  ScopStmt *Stmt = static_cast<ScopStmt *>(InputDimsId.get_user());
  // ...
  TCI.DimensionSizes.resize(DimNum);
  SmallDenseSet<int> IandJIndexSet;
  // ...
  TCI.WriteToC = getWriteAccess(Domain, Stmt, TCI, IandJIndexSet);
  if (!TCI.WriteToC)
    return false;
  // ...
  if (intersect(IandJIndexSet, TCI.P).size() != 0)
    return false;
  // ...
  if (!setReadAccesses(Domain, Stmt, TCI, IandJIndexSet))
    return false;
  return true;
}
static bool isReductionCarriedOverDim(isl::set DepDelta, unsigned Dim,
                                      isl::pw_multi_aff BoundDeltas,
                                      const SmallDenseSet<int> &IndexSet) {
  isl::space Space = DepDelta.get_space();
  // ...
  for (unsigned i = 0; i < Dim; i += 1) {
    // ...
  }
  // ... (inside a loop over the remaining dimensions)
    if (!IndexSet.count(i)) {
      // ...
    }
  // ...
}
static bool areDepsOverCompleteDomain(isl::set Domain, isl::map DepsForStmt,
                                      isl::pw_multi_aff UpperBound,
                                      SmallDenseSet<int> &IndexSet) {
  // ...
  for (const auto It : IndexSet) {
    // ...
  }
  // ... (the final check compares the dependence domain against
  //      Domain.subtract(DomainRed))
}
static bool containsOnlyTcDeps(isl::map Schedule, const Dependences *D,
                               SmallDenseSet<int> &IndexSet, isl::set Domain) {
  // ...
  isl::union_map Dep = /* ... */;
  // ...
  isl::pw_multi_aff LowerBound = Domain.lexmin_pw_multi_aff();
  isl::pw_multi_aff UpperBound = Domain.lexmax_pw_multi_aff();
  isl::pw_multi_aff BoundDeltas = UpperBound.sub(LowerBound);
  // ... (inside a loop over the delta dimensions)
    if (!isReductionCarriedOverDim(Intersection, i, BoundDeltas, IndexSet))
      return false;
    // ...
    DepDeltas = DepDeltas.subtract(Intersection);
  // ...
  return areDepsOverCompleteDomain(Domain, DepsForStmt, UpperBound, IndexSet);
}
static bool containsTCInfoTy(isl::map PartialSchedule, const Dependences *D,
                             TCInfoTy &TCI, isl::set Domain) {
  if (!containsOnlyTcDeps(PartialSchedule, D, TCI.P, Domain))
    return false;
  // ...
  if (TCI.P.size() == 0)
    return false;
  // ...
  if (!containsOnlyTCAcc(Domain, PartialSchedule, TCI))
    return false;
  // ...
  if ((TCI.I.size() == 0) || (TCI.J.size() == 0))
    return false;
  return true;
}
static bool isTCPattern(isl::schedule_node Node, const Dependences *D,
                        TCInfoTy &TCI) {
  Node = Node.child(0);
  // ...
  if (!Node.parent().isa<isl::schedule_node_sequence>() ||
      /* ... */)
    return false;
  // ...
  if (containsTCInfoTy(PartialScheduleMap, D, TCI, isl::set(Domain)))
    return true;
  return false;
}
isl::schedule_node
polly::tryOptimizeMatMulPattern(isl::schedule_node Node,
                                const llvm::TargetTransformInfo *TTI,
                                const Dependences *D) {
  TCInfoTy TCI;
  if (PMBasedTCOpts && isTCPattern(Node, D, TCI))
    POLLY_DEBUG(dbgs() << "The tensor contraction pattern was detected\n");
  MatMulInfoTy MMI;
  if (PMBasedMMMOpts && isMatrMultPattern(Node, D, MMI)) {
    POLLY_DEBUG(dbgs() << "The matrix multiplication pattern was detected\n");
    return optimizeMatMulPattern(Node, TTI, MMI);
  }
  return {};
}
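// Usage note (an assumption about the surrounding wiring, not shown in this
// excerpt): the schedule optimizer calls tryOptimizeMatMulPattern for the
// band nodes it visits; a non-null result replaces the band with the
// rescheduled subtree, while a null result lets the generic optimization
// strategy continue.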