Skip to content

Commit 97f37b5

Browse files
ENH: Vectorization for forest mean/variance calculations (#3362)
* vectorization for mean/variance calcualtions * name key variables * see if reuse in extra-trees is causing failures * fixes * fix remaining merge conflicts * fix remaining merge conflicts * yet more fixes * fix * use camel case * linter * more standardization * solve merge conflicts * control vectorization width by instruction set * correct header * use pragma wrappers * Update cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_dense_default_impl.i Co-authored-by: Victoriya Fedotova <[email protected]> * Update cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_dense_default_impl.i Co-authored-by: Victoriya Fedotova <[email protected]> * Update cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_dense_default_impl.i Co-authored-by: Victoriya Fedotova <[email protected]> * Update cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_dense_default_impl.i Co-authored-by: Victoriya Fedotova <[email protected]> * add comments for weighted version --------- Co-authored-by: Victoriya Fedotova <[email protected]>
1 parent c55a231 commit 97f37b5

File tree

1 file changed

+186
-31
lines changed

1 file changed

+186
-31
lines changed

cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_dense_default_impl.i

Lines changed: 186 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
#ifndef __DF_REGRESSION_TRAIN_DENSE_DEFAULT_IMPL_I__
2626
#define __DF_REGRESSION_TRAIN_DENSE_DEFAULT_IMPL_I__
2727

28+
#include <limits>
29+
2830
#include "src/algorithms/dtrees/forest/df_train_dense_default_impl.i"
2931
#include "src/algorithms/dtrees/forest/regression/df_regression_train_kernel.h"
3032
#include "src/algorithms/dtrees/forest/regression/df_regression_model_impl.h"
@@ -168,54 +170,207 @@ protected:
168170
mutable TVector<intermSummFPType, cpu, DefaultAllocator<cpu> > _weightsFeatureBuf;
169171
};
170172

173+
// For details about the vectorized version, see the wikipedia article:
174+
// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
171175
template <typename algorithmFPType, CpuType cpu>
172176
template <bool noWeights>
173177
void OrderedRespHelperBest<algorithmFPType, cpu>::calcImpurity(const IndexType * aIdx, size_t n, ImpurityData & imp,
174178
intermSummFPType & totalWeights) const
175179
{
180+
#if (__CPUID__(DAAL_CPU) == __avx512__)
181+
constexpr const size_t simdBatchSize = 8;
182+
constexpr const size_t minObsVectorizedPath = 32;
183+
#elif (__CPUID__(DAAL_CPU) == __avx2__)
184+
constexpr const size_t simdBatchSize = 4;
185+
constexpr const size_t minObsVectorizedPath = 32;
186+
#else
187+
constexpr const size_t simdBatchSize = 1;
188+
constexpr const size_t minObsVectorizedPath = std::numeric_limits<size_t>::max();
189+
#endif
176190
if (noWeights)
177191
{
178-
imp.var = 0;
179-
imp.mean = this->_aResponse[aIdx[0]].val;
192+
if (n < minObsVectorizedPath)
193+
{
194+
imp.var = 0;
195+
imp.mean = this->_aResponse[aIdx[0]].val;
196+
197+
for (size_t i = 1; i < n; ++i)
198+
{
199+
const intermSummFPType y = this->_aResponse[aIdx[i]].val;
200+
const intermSummFPType delta = y - imp.mean;
201+
imp.mean += delta / static_cast<intermSummFPType>(i + 1);
202+
imp.var += delta * (y - imp.mean);
203+
}
204+
totalWeights = static_cast<intermSummFPType>(n);
205+
imp.var /= totalWeights; //impurity is MSE
206+
}
180207

181-
for (size_t i = 1; i < n; ++i)
208+
else
182209
{
183-
const intermSummFPType delta = this->_aResponse[aIdx[i]].val - imp.mean; //x[i] - mean
184-
imp.mean += delta / static_cast<intermSummFPType>(i + 1);
185-
imp.var += delta * (this->_aResponse[aIdx[i]].val - imp.mean);
210+
intermSummFPType means[simdBatchSize] = { 0 };
211+
intermSummFPType sumsOfSquares[simdBatchSize] = { 0 };
212+
intermSummFPType yBatch[simdBatchSize];
213+
214+
const size_t itersSimdLoop = n / simdBatchSize;
215+
const size_t sizeSimdLoop = itersSimdLoop * simdBatchSize;
216+
217+
for (size_t iMain = 0; iMain < itersSimdLoop; iMain++)
218+
{
219+
const size_t iStart = iMain * simdBatchSize;
220+
const auto aIdxStart = aIdx + iStart;
221+
const intermSummFPType mult = 1.0 / static_cast<intermSummFPType>(iMain + 1);
222+
223+
// Pack the responses into a continuous memory block
224+
PRAGMA_OMP_SIMD_ARGS(simdlen(simdBatchSize))
225+
for (size_t iSub = 0; iSub < simdBatchSize; iSub++)
226+
{
227+
yBatch[iSub] = this->_aResponse[aIdxStart[iSub]].val;
228+
}
229+
230+
// Update vector of partial means and sum of squares using an incremental algorithm
231+
PRAGMA_OMP_SIMD
232+
for (size_t iSub = 0; iSub < simdBatchSize; iSub++)
233+
{
234+
const intermSummFPType y = yBatch[iSub];
235+
intermSummFPType meanBatch = means[iSub];
236+
const intermSummFPType delta = y - meanBatch;
237+
meanBatch += delta * mult;
238+
sumsOfSquares[iSub] += delta * (y - meanBatch);
239+
means[iSub] = meanBatch;
240+
}
241+
}
242+
243+
imp.mean = means[0];
244+
imp.var = sumsOfSquares[0];
245+
intermSummFPType var_deltas = 0;
246+
// Compute means and sum of squares for the first `sizeSimdLoop` responses
247+
for (size_t i = 1; i < simdBatchSize; i++)
248+
{
249+
const intermSummFPType delta = means[i] - imp.mean;
250+
const intermSummFPType div = 1.0 / static_cast<intermSummFPType>(i + 1);
251+
imp.mean += delta * div;
252+
imp.var += sumsOfSquares[i];
253+
var_deltas += (delta * delta) * (static_cast<intermSummFPType>(i) * div);
254+
}
255+
imp.var += var_deltas * itersSimdLoop;
256+
257+
// Process tail elements, if any
258+
for (size_t i = sizeSimdLoop; i < n; i++)
259+
{
260+
const intermSummFPType y = this->_aResponse[aIdx[i]].val;
261+
const intermSummFPType delta = y - imp.mean;
262+
imp.mean += delta / static_cast<intermSummFPType>(i + 1);
263+
imp.var += delta * (y - imp.mean);
264+
}
265+
totalWeights = static_cast<intermSummFPType>(n);
266+
imp.var /= totalWeights;
186267
}
187-
totalWeights = static_cast<intermSummFPType>(n);
188-
imp.var /= totalWeights; //impurity is MSE
189268
}
190269
else
191270
{
192-
imp.mean = 0;
193-
imp.var = 0;
194-
totalWeights = 0;
195-
196-
// Note: weights can be exactly zero, in which case they'd break the division.
197-
// Since zero-weight observations have no impact on the results, this tries to
198-
// look for the first non-zero weight.
199-
size_t i_start;
200-
for (i_start = 0; i_start < n && !this->_aWeights[aIdx[i_start]].val; i_start++)
201-
;
202-
203-
for (size_t i = i_start; i < n; ++i)
271+
if (n < minObsVectorizedPath)
272+
{
273+
imp.mean = 0;
274+
imp.var = 0;
275+
totalWeights = 0;
276+
277+
// Note: weights can be exactly zero, in which case they'd break the division.
278+
// Since zero-weight observations have no impact on the results, this tries to
279+
// look for the first non-zero weight.
280+
size_t iStart;
281+
for (iStart = 0; iStart < n && !this->_aWeights[aIdx[iStart]].val; iStart++)
282+
;
283+
284+
for (size_t i = iStart; i < n; ++i)
285+
{
286+
const intermSummFPType weights = this->_aWeights[aIdx[i]].val;
287+
const intermSummFPType y = this->_aResponse[aIdx[i]].val;
288+
const intermSummFPType delta = y - imp.mean;
289+
totalWeights += weights;
290+
imp.mean += delta * (weights / totalWeights);
291+
imp.var += weights * delta * (y - imp.mean);
292+
}
293+
if (totalWeights) imp.var /= totalWeights; //impurity is MSE
294+
}
295+
296+
else
204297
{
205-
const intermSummFPType weights = this->_aWeights[aIdx[i]].val;
206-
const intermSummFPType y = this->_aResponse[aIdx[i]].val;
207-
const intermSummFPType delta = y - imp.mean; //x[i] - mean
208-
totalWeights += weights;
209-
imp.mean += delta * (weights / totalWeights);
210-
imp.var += weights * delta * (y - imp.mean);
298+
intermSummFPType means[simdBatchSize] = { 0 };
299+
intermSummFPType sumsOfSquares[simdBatchSize] = { 0 };
300+
intermSummFPType sumsOfWeights[simdBatchSize] = { 0 };
301+
intermSummFPType yBatch[simdBatchSize];
302+
intermSummFPType weightsBatch[simdBatchSize];
303+
304+
const size_t itersSimdLoop = n / simdBatchSize;
305+
const size_t sizeSimdLoop = itersSimdLoop * simdBatchSize;
306+
307+
for (size_t iMain = 0; iMain < itersSimdLoop; iMain++)
308+
{
309+
const size_t iStart = iMain * simdBatchSize;
310+
const auto aIdxStart = aIdx + iStart;
311+
312+
// Pack the data from indices into contiguous blocks
313+
PRAGMA_OMP_SIMD_ARGS(simdlen(simdBatchSize))
314+
for (size_t iSub = 0; iSub < simdBatchSize; iSub++)
315+
{
316+
yBatch[iSub] = this->_aResponse[aIdxStart[iSub]].val;
317+
weightsBatch[iSub] = this->_aWeights[aIdxStart[iSub]].val;
318+
}
319+
320+
// Update the vectors of means, variances, and cumulative weights
321+
PRAGMA_OMP_SIMD
322+
for (size_t iSub = 0; iSub < simdBatchSize; iSub++)
323+
{
324+
const intermSummFPType y = yBatch[iSub];
325+
const intermSummFPType weight = weightsBatch[iSub];
326+
sumsOfWeights[iSub] += weight;
327+
328+
intermSummFPType meanBatch = means[iSub];
329+
const intermSummFPType delta = y - meanBatch;
330+
meanBatch += sumsOfWeights[iSub] ? (delta * (weight / sumsOfWeights[iSub])) : 0;
331+
sumsOfSquares[iSub] += weight * (delta * (y - meanBatch));
332+
means[iSub] = meanBatch;
333+
}
334+
}
335+
336+
// Merge the aggregates
337+
imp.mean = means[0];
338+
imp.var = sumsOfSquares[0];
339+
totalWeights = sumsOfWeights[0];
340+
intermSummFPType var_deltas = 0;
341+
size_t iBatch;
342+
for (iBatch = 1; iBatch < simdBatchSize && !sumsOfWeights[iBatch]; iBatch++)
343+
;
344+
for (; iBatch < simdBatchSize; iBatch++)
345+
{
346+
const intermSummFPType weightNew = sumsOfWeights[iBatch];
347+
const intermSummFPType weightLeft = totalWeights;
348+
totalWeights += weightNew;
349+
const intermSummFPType fractionRight = weightNew / totalWeights;
350+
const intermSummFPType delta = means[iBatch] - imp.mean;
351+
imp.mean += delta * fractionRight;
352+
imp.var += sumsOfSquares[iBatch];
353+
var_deltas += (delta * delta) * (weightLeft * fractionRight);
354+
}
355+
imp.var += var_deltas;
356+
357+
// Process tail elements, if any
358+
size_t i;
359+
for (i = sizeSimdLoop; i < n && !this->_aWeights[aIdx[i]].val; i++)
360+
;
361+
for (; i < n; i++)
362+
{
363+
const intermSummFPType weight = this->_aWeights[aIdx[i]].val;
364+
const intermSummFPType y = this->_aResponse[aIdx[i]].val;
365+
const intermSummFPType delta = y - imp.mean;
366+
totalWeights += weight;
367+
imp.mean += delta * (weight / totalWeights);
368+
imp.var += weight * delta * (y - imp.mean);
369+
}
370+
if (totalWeights) imp.var /= totalWeights;
211371
}
212-
if (totalWeights) imp.var /= totalWeights; //impurity is MSE
213372
}
214373

215-
// Note: the debug checks throughout this file are always done in float64 precision regardless
216-
// of the input data type, as otherwise they can get too inaccurate when sample sizes are more
217-
// than a few million rows, up to the point where the debug calculation would be less precise
218-
// than the shorthand non-debug calculation.
219374
#ifdef DEBUG_CHECK_IMPURITY
220375
if (!this->_weights)
221376
{

0 commit comments

Comments
 (0)