Skip to content

Commit 678a691

Browse files
sgizlerjbylicki
authored and committed
Make summations autovectorizable
Signed-off-by: Szymon Gizler <sgizler@antmicro.com>
1 parent 457c273 commit 678a691

3 files changed

Lines changed: 62 additions & 40 deletions

File tree

src/gpl2/src/densityOp.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
#include "densityOp.h"
3636

3737
#include <Kokkos_Core.hpp>
38+
#include "kokkosUtil.h"
3839

3940
#include "placerBase.h"
4041
#include "placerObjects.h"
@@ -292,10 +293,10 @@ void DensityOp::updateDensityForceBin()
292293

293294
sumOverflow_ = 0.0;
294295
Kokkos::DefaultHostExecutionSpace hostSpace;
295-
auto hBinOverflowArea = Kokkos::create_mirror_view_and_copy(hostSpace, dBinOverflowArea);
296-
for(int i = 0; i<numBins; ++i) {
297-
sumOverflow_ += hBinOverflowArea[i];
298-
}
296+
297+
Kokkos::View<float*> hBinOverflowArea("hBinOverflowArea", dBinOverflowArea.size());
298+
Kokkos::deep_copy(hBinOverflowArea, dBinOverflowArea);
299+
sumOverflow_ = sumFloats(hBinOverflowArea, numBins);
299300

300301
Kokkos::fence();
301302

src/gpl2/src/kokkosUtil.h

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,3 +81,51 @@ KOKKOS_INLINE_FUNCTION float consistentCosf(float x) {
8181
KOKKOS_INLINE_FUNCTION float consistentExpf(float x) {
8282
return exp((double) x);
8383
}
84+
85+
// Qualifiers for functions that are only defined on the host side.
// With the CUDA backend enabled they expand to plain __host__ qualifiers;
// otherwise they forward to the regular Kokkos function macros.
#if defined(KOKKOS_ENABLE_CUDA)
#define HOST_FUNCTION __host__
#define HOST_INLINE_FUNCTION inline __host__
#else
#define HOST_FUNCTION KOKKOS_FUNCTION
#define HOST_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
#endif
96+
97+
// We can't use parallel_reduce as we would lose consistency between platforms.
98+
// In order to ensure consistency with as low a performance penalty as possible, we do it with host-only functions
99+
// that are autovectorizable by the compiler.
100+
/// Sums the first `size` elements of `arr` on the host, accumulating in float.
///
/// The data is copied to the default host execution space first. The leading
/// multiple-of-4 prefix is then accumulated into four independent partial
/// sums (element i goes to lane i % 4), and the remaining 0-3 elements are
/// accumulated sequentially. The accumulation order is fixed and must not be
/// changed: float addition is not associative, so reordering changes results.
///
/// @param arr  View holding the values to add up.
/// @param size Number of leading elements to sum. Elements 0..size-1 are
///             indexed directly, so it must not exceed the length of `arr`.
/// @return partialSums[0] + partialSums[1] + partialSums[2] + partialSums[3]
///         + leftover, evaluated left to right.
HOST_INLINE_FUNCTION float sumFloats(const Kokkos::View<const float*> arr, size_t size) {
  auto hArr = Kokkos::create_mirror_view_and_copy(Kokkos::DefaultHostExecutionSpace(), arr);

  // Largest multiple of 4 that is <= size. Indices are size_t to match `size`:
  // an int index would be compared against an unsigned bound and would
  // overflow for sizes above INT_MAX.
  const size_t blockedEnd = size / 4 * 4;

  float partialSums[4] = {0.0f, 0.0f, 0.0f, 0.0f};
  for (size_t i = 0; i < blockedEnd; i += 4) {
    partialSums[0] += hArr[i + 0];
    partialSums[1] += hArr[i + 1];
    partialSums[2] += hArr[i + 2];
    partialSums[3] += hArr[i + 3];
  }

  // Tail: the 0-3 elements that do not fill a complete group of four.
  float leftover = 0.0f;
  for (size_t i = blockedEnd; i < size; ++i) {
    leftover += hArr[i];
  }

  return partialSums[0] + partialSums[1] + partialSums[2] + partialSums[3] + leftover;
}
115+
116+
// More accurate version of sumFloats() that uses a double accumulator. TODO: Consider using the Kahan summation algorithm
117+
/// Sums the first `size` elements of `arr` on the host, accumulating in double.
///
/// Uses the same fixed accumulation order as a four-lane blocked sum: the
/// leading multiple-of-4 prefix goes into four independent double partial
/// sums (element i goes to lane i % 4), and the remaining 0-3 elements are
/// accumulated sequentially. The data is copied to the default host execution
/// space before summing.
///
/// @param arr  View holding the values to add up.
/// @param size Number of leading elements to sum. Elements 0..size-1 are
///             indexed directly, so it must not exceed the length of `arr`.
/// @return The double-precision total, narrowed to float on return.
HOST_INLINE_FUNCTION float sumFloatsAccurate(const Kokkos::View<const float*> arr, size_t size) {
  auto hArr = Kokkos::create_mirror_view_and_copy(Kokkos::DefaultHostExecutionSpace(), arr);

  // Largest multiple of 4 that is <= size. Indices are size_t to match `size`:
  // an int index would be compared against an unsigned bound and would
  // overflow for sizes above INT_MAX.
  const size_t blockedEnd = size / 4 * 4;

  double partialSums[4] = {0.0, 0.0, 0.0, 0.0};
  for (size_t i = 0; i < blockedEnd; i += 4) {
    partialSums[0] += hArr[i + 0];
    partialSums[1] += hArr[i + 1];
    partialSums[2] += hArr[i + 2];
    partialSums[3] += hArr[i + 3];
  }

  // Tail: the 0-3 elements that do not fill a complete group of four.
  double leftover = 0.0;
  for (size_t i = blockedEnd; i < size; ++i) {
    leftover += hArr[i];
  }

  // The interface returns float; make the narrowing from double explicit.
  return static_cast<float>(partialSums[0] + partialSums[1] + partialSums[2] + partialSums[3] + leftover);
}

src/gpl2/src/placerBase.cpp

Lines changed: 9 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
#include <unordered_set>
4949

5050
#include "db_sta/dbNetwork.hh"
51+
#include "kokkosUtil.h"
5152
#include "placerObjects.h"
5253
#include "sta/Liberty.hh"
5354
#include "utl/Logger.h"
@@ -58,13 +59,6 @@ using utl::GPL2;
5859

5960
#define REPLACE_SQRT2 1.414213562373095048801L
6061

61-
#ifdef KOKKOS_ENABLE_CUDA
62-
#define BACKEND_DEPENDENT_FUNCTION __host__
63-
#else
64-
#define BACKEND_DEPENDENT_FUNCTION KOKKOS_FUNCTION
65-
#endif
66-
67-
6862
///////////////////////////////////////////////////////////////////////////////////
6963
// PlacerBaseVars
7064
///////////////////////////////////////////////////////////////////////////////////
@@ -844,17 +838,14 @@ void PlacerBase::initDensity1()
844838

845839
// (a) define the get distance method
846840
// getDistance is only defined on the host side
847-
BACKEND_DEPENDENT_FUNCTION float getDistance(const Kokkos::View<const FloatPoint*>& a,
841+
HOST_FUNCTION float getDistance(const Kokkos::View<const FloatPoint*>& a,
848842
const Kokkos::View<const FloatPoint*>& b,
849843
const int numInsts)
850844
{
851845
if (numInsts <= 0) {
852846
return 0.0;
853847
}
854848

855-
float sumDistance = 0.0;
856-
Kokkos::DefaultHostExecutionSpace hostSpace;
857-
858849
auto aPlusbDistance = Kokkos::View<float*, Kokkos::DefaultExecutionSpace>("aPlusbDistance", numInsts);
859850
Kokkos::parallel_for(numInsts, KOKKOS_LAMBDA (const int i) {
860851
const FloatPoint& aPoint = a[i];
@@ -864,11 +855,7 @@ BACKEND_DEPENDENT_FUNCTION float getDistance(const Kokkos::View<const FloatPoint
864855
aPlusbDistance[i] = aDistance + bDistance;
865856
});
866857

867-
auto haPlusbDistance = Kokkos::create_mirror_view_and_copy(hostSpace, aPlusbDistance);
868-
for(int i = 0; i<numInsts; ++i) {
869-
sumDistance += haPlusbDistance[i];
870-
}
871-
858+
float sumDistance = sumFloats(aPlusbDistance, numInsts);
872859
return std::sqrt(sumDistance / (2.0 * numInsts));
873860
}
874861

@@ -881,19 +868,6 @@ struct myAbs
881868
}
882869
};
883870

884-
BACKEND_DEPENDENT_FUNCTION float getAbsGradSum(const Kokkos::View<const float*>& a, const int numInsts)
885-
{
886-
Kokkos::DefaultHostExecutionSpace hostSpace;
887-
auto hA = Kokkos::create_mirror_view_and_copy(hostSpace, a);
888-
889-
double sumAbs = 0.0;
890-
for(int i = 0; i<numInsts; ++i) {
891-
double x = hA[i];
892-
sumAbs += x;
893-
}
894-
return sumAbs;
895-
}
896-
897871
float PlacerBase::getStepLength(const Kokkos::View<const FloatPoint*>& prevSLPCoordi,
898872
const Kokkos::View<const FloatPoint*>& prevSLPSumGrads,
899873
const Kokkos::View<const FloatPoint*>& curSLPCoordi,
@@ -1007,15 +981,14 @@ void PlacerBase::updateGradients(const Kokkos::View<float*>& wireLengthGradients
1007981
densityGradSum_ = 0;
1008982

1009983
// get the forces on each instance
1010-
Kokkos::View<float*> wirelenabsGradXPlusY("absGradXPlusY", numInsts_);
1011-
Kokkos::View<float*> densityabsGradXPlusY("absGradXPlusY", numInsts_);
1012-
Kokkos::DefaultHostExecutionSpace hostSpace;
984+
Kokkos::View<float*> wireLengthGradAbsXPlusY("wireLengthGradAbsXPlusY", numInsts_);
985+
Kokkos::View<float*> densityGradAbsXPlusY("densityGradAbsXPlusY", numInsts_);
1013986

1014-
getWireLengthGradientWA(wireLengthGradientsX, wireLengthGradientsY, wirelenabsGradXPlusY);
1015-
getDensityGradient(densityGradientsX, densityGradientsY, densityabsGradXPlusY);
987+
getWireLengthGradientWA(wireLengthGradientsX, wireLengthGradientsY, wireLengthGradAbsXPlusY);
988+
getDensityGradient(densityGradientsX, densityGradientsY, densityGradAbsXPlusY);
1016989

1017-
wireLengthGradSum_ += getAbsGradSum(wirelenabsGradXPlusY, numInsts_);
1018-
densityGradSum_ += getAbsGradSum(densityabsGradXPlusY, numInsts_);
990+
wireLengthGradSum_ += sumFloatsAccurate(wireLengthGradAbsXPlusY, numInsts_);
991+
densityGradSum_ += sumFloatsAccurate(densityGradAbsXPlusY, numInsts_);
1019992

1020993
sumGradientKernel(numInsts_,
1021994
densityPenalty_,

0 commit comments

Comments
 (0)