|
25 | 25 | #ifndef __DF_REGRESSION_TRAIN_DENSE_DEFAULT_IMPL_I__ |
26 | 26 | #define __DF_REGRESSION_TRAIN_DENSE_DEFAULT_IMPL_I__ |
27 | 27 |
|
| 28 | +#include <limits> |
| 29 | + |
28 | 30 | #include "src/algorithms/dtrees/forest/df_train_dense_default_impl.i" |
29 | 31 | #include "src/algorithms/dtrees/forest/regression/df_regression_train_kernel.h" |
30 | 32 | #include "src/algorithms/dtrees/forest/regression/df_regression_model_impl.h" |
@@ -168,54 +170,207 @@ protected: |
168 | 170 | mutable TVector<intermSummFPType, cpu, DefaultAllocator<cpu> > _weightsFeatureBuf; |
169 | 171 | }; |
170 | 172 |
|
| 173 | +// For details about the vectorized version, see the Wikipedia article:
| 174 | +// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm |
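| | +//
| | +// Each SIMD lane keeps a running (mean, sum of squared deviations) pair;
| | +// the lanes are then combined with the pairwise merge: for partial counts
| | +// n_a, n_b and delta = mean_b - mean_a,
| | +//     mean_ab = mean_a + delta * n_b / (n_a + n_b)
| | +//     M2_ab   = M2_a + M2_b + delta * delta * n_a * n_b / (n_a + n_b)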
171 | 175 | template <typename algorithmFPType, CpuType cpu> |
172 | 176 | template <bool noWeights> |
173 | 177 | void OrderedRespHelperBest<algorithmFPType, cpu>::calcImpurity(const IndexType * aIdx, size_t n, ImpurityData & imp, |
174 | 178 | intermSummFPType & totalWeights) const |
175 | 179 | { |
| 180 | +#if (__CPUID__(DAAL_CPU) == __avx512__) |
| 181 | + constexpr const size_t simdBatchSize = 8; |
| 182 | + constexpr const size_t minObsVectorizedPath = 32; |
| 183 | +#elif (__CPUID__(DAAL_CPU) == __avx2__) |
| 184 | + constexpr const size_t simdBatchSize = 4; |
| 185 | + constexpr const size_t minObsVectorizedPath = 32; |
| 186 | +#else |
| 187 | + constexpr const size_t simdBatchSize = 1; |
| 188 | + constexpr const size_t minObsVectorizedPath = std::numeric_limits<size_t>::max(); |
| 189 | +#endif |
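| | + // Below 'minObsVectorizedPath' rows the scalar Welford loop is taken; on
| | + // CPUs other than AVX2/AVX-512 targets the vectorized path is disabled.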
176 | 190 | if (noWeights) |
177 | 191 | { |
178 | | - imp.var = 0; |
179 | | - imp.mean = this->_aResponse[aIdx[0]].val; |
| 192 | + if (n < minObsVectorizedPath) |
| 193 | + { |
| 194 | + imp.var = 0; |
| 195 | + imp.mean = this->_aResponse[aIdx[0]].val; |
| 196 | + |
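| | + // Welford's online update: with i rows seen so far and delta = y - mean,
| | + // mean += delta / i and M2 += delta * (y - updated mean)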
| 197 | + for (size_t i = 1; i < n; ++i) |
| 198 | + { |
| 199 | + const intermSummFPType y = this->_aResponse[aIdx[i]].val; |
| 200 | + const intermSummFPType delta = y - imp.mean; |
| 201 | + imp.mean += delta / static_cast<intermSummFPType>(i + 1); |
| 202 | + imp.var += delta * (y - imp.mean); |
| 203 | + } |
| 204 | + totalWeights = static_cast<intermSummFPType>(n); |
| 205 | + imp.var /= totalWeights; // impurity is MSE
| 206 | + } |
180 | 207 |
|
181 | | - for (size_t i = 1; i < n; ++i) |
| 208 | + else |
182 | 209 | { |
183 | | - const intermSummFPType delta = this->_aResponse[aIdx[i]].val - imp.mean; //x[i] - mean |
184 | | - imp.mean += delta / static_cast<intermSummFPType>(i + 1); |
185 | | - imp.var += delta * (this->_aResponse[aIdx[i]].val - imp.mean); |
| 210 | + intermSummFPType means[simdBatchSize] = { 0 }; |
| 211 | + intermSummFPType sumsOfSquares[simdBatchSize] = { 0 }; |
| 212 | + intermSummFPType yBatch[simdBatchSize]; |
| 213 | + |
| 214 | + const size_t itersSimdLoop = n / simdBatchSize; |
| 215 | + const size_t sizeSimdLoop = itersSimdLoop * simdBatchSize; |
| 216 | + |
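| | + // Lane j accumulates rows j, j + simdBatchSize, j + 2 * simdBatchSize, ...,
| | + // so every lane sees exactly itersSimdLoop rows.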
| 217 | + for (size_t iMain = 0; iMain < itersSimdLoop; iMain++) |
| 218 | + { |
| 219 | + const size_t iStart = iMain * simdBatchSize; |
| 220 | + const auto aIdxStart = aIdx + iStart; |
| 221 | + const intermSummFPType mult = 1.0 / static_cast<intermSummFPType>(iMain + 1); |
| 222 | + |
| 223 | + // Pack the responses into a contiguous memory block
| 224 | + PRAGMA_OMP_SIMD_ARGS(simdlen(simdBatchSize)) |
| 225 | + for (size_t iSub = 0; iSub < simdBatchSize; iSub++) |
| 226 | + { |
| 227 | + yBatch[iSub] = this->_aResponse[aIdxStart[iSub]].val; |
| 228 | + } |
| 229 | + |
| 230 | + // Update vector of partial means and sum of squares using an incremental algorithm |
| 231 | + PRAGMA_OMP_SIMD |
| 232 | + for (size_t iSub = 0; iSub < simdBatchSize; iSub++) |
| 233 | + { |
| 234 | + const intermSummFPType y = yBatch[iSub]; |
| 235 | + intermSummFPType meanBatch = means[iSub]; |
| 236 | + const intermSummFPType delta = y - meanBatch; |
| 237 | + meanBatch += delta * mult; |
| 238 | + sumsOfSquares[iSub] += delta * (y - meanBatch); |
| 239 | + means[iSub] = meanBatch; |
| 240 | + } |
| 241 | + } |
| 242 | + |
| 243 | + imp.mean = means[0]; |
| 244 | + imp.var = sumsOfSquares[0]; |
| 245 | + intermSummFPType varDeltas = 0;
| 246 | + // Merge the per-lane aggregates into the mean and sum of squares over the first `sizeSimdLoop` responses
| 247 | + for (size_t i = 1; i < simdBatchSize; i++) |
| 248 | + { |
| 249 | + const intermSummFPType delta = means[i] - imp.mean; |
| 250 | + const intermSummFPType div = 1.0 / static_cast<intermSummFPType>(i + 1); |
| 251 | + imp.mean += delta * div; |
| 252 | + imp.var += sumsOfSquares[i]; |
| 253 | + varDeltas += (delta * delta) * (static_cast<intermSummFPType>(i) * div);
| 254 | + } |
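| | + // Every lane aggregated exactly 'itersSimdLoop' rows, so each merge's
| | + // cross term delta^2 * n_a * n_b / (n_a + n_b) reduces to
| | + // itersSimdLoop * delta^2 * i / (i + 1); the common factor is applied
| | + // once, after the loop.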
| 255 | + imp.var += varDeltas * itersSimdLoop;
| 256 | + |
| 257 | + // Process tail elements, if any |
| 258 | + for (size_t i = sizeSimdLoop; i < n; i++) |
| 259 | + { |
| 260 | + const intermSummFPType y = this->_aResponse[aIdx[i]].val; |
| 261 | + const intermSummFPType delta = y - imp.mean; |
| 262 | + imp.mean += delta / static_cast<intermSummFPType>(i + 1); |
| 263 | + imp.var += delta * (y - imp.mean); |
| 264 | + } |
| 265 | + totalWeights = static_cast<intermSummFPType>(n); |
| 266 | + imp.var /= totalWeights; // impurity is MSE
186 | 267 | } |
187 | | - totalWeights = static_cast<intermSummFPType>(n); |
188 | | - imp.var /= totalWeights; //impurity is MSE |
189 | 268 | } |
190 | 269 | else |
191 | 270 | { |
192 | | - imp.mean = 0; |
193 | | - imp.var = 0; |
194 | | - totalWeights = 0; |
195 | | - |
196 | | - // Note: weights can be exactly zero, in which case they'd break the division. |
197 | | - // Since zero-weight observations have no impact on the results, this tries to |
198 | | - // look for the first non-zero weight. |
199 | | - size_t i_start; |
200 | | - for (i_start = 0; i_start < n && !this->_aWeights[aIdx[i_start]].val; i_start++) |
201 | | - ; |
202 | | - |
203 | | - for (size_t i = i_start; i < n; ++i) |
| 271 | + if (n < minObsVectorizedPath) |
| 272 | + { |
| 273 | + imp.mean = 0; |
| 274 | + imp.var = 0; |
| 275 | + totalWeights = 0; |
| 276 | + |
| 277 | + // Note: weights can be exactly zero, in which case they'd break the division. |
| 278 | + // Since zero-weight observations have no impact on the results, this tries to |
| 279 | + // look for the first non-zero weight. |
| 280 | + size_t iStart; |
| 281 | + for (iStart = 0; iStart < n && !this->_aWeights[aIdx[iStart]].val; iStart++) |
| 282 | + ; |
| 283 | + |
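| | + // Weighted incremental update (West, 1979): with W the running sum of
| | + // weights, mean += delta * (w / W) and M2 += w * delta * (y - updated mean)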
| 284 | + for (size_t i = iStart; i < n; ++i) |
| 285 | + { |
| 286 | + const intermSummFPType weights = this->_aWeights[aIdx[i]].val; |
| 287 | + const intermSummFPType y = this->_aResponse[aIdx[i]].val; |
| 288 | + const intermSummFPType delta = y - imp.mean; |
| 289 | + totalWeights += weights; |
| 290 | + imp.mean += delta * (weights / totalWeights); |
| 291 | + imp.var += weights * delta * (y - imp.mean); |
| 292 | + } |
| 293 | + if (totalWeights) imp.var /= totalWeights; // impurity is MSE
| 294 | + } |
| 295 | + |
| 296 | + else |
204 | 297 | { |
205 | | - const intermSummFPType weights = this->_aWeights[aIdx[i]].val; |
206 | | - const intermSummFPType y = this->_aResponse[aIdx[i]].val; |
207 | | - const intermSummFPType delta = y - imp.mean; //x[i] - mean |
208 | | - totalWeights += weights; |
209 | | - imp.mean += delta * (weights / totalWeights); |
210 | | - imp.var += weights * delta * (y - imp.mean); |
| 298 | + intermSummFPType means[simdBatchSize] = { 0 }; |
| 299 | + intermSummFPType sumsOfSquares[simdBatchSize] = { 0 }; |
| 300 | + intermSummFPType sumsOfWeights[simdBatchSize] = { 0 }; |
| 301 | + intermSummFPType yBatch[simdBatchSize]; |
| 302 | + intermSummFPType weightsBatch[simdBatchSize]; |
| 303 | + |
| 304 | + const size_t itersSimdLoop = n / simdBatchSize; |
| 305 | + const size_t sizeSimdLoop = itersSimdLoop * simdBatchSize; |
| 306 | + |
| 307 | + for (size_t iMain = 0; iMain < itersSimdLoop; iMain++) |
| 308 | + { |
| 309 | + const size_t iStart = iMain * simdBatchSize; |
| 310 | + const auto aIdxStart = aIdx + iStart; |
| 311 | + |
| 312 | + // Pack the data from indices into contiguous blocks |
| 313 | + PRAGMA_OMP_SIMD_ARGS(simdlen(simdBatchSize)) |
| 314 | + for (size_t iSub = 0; iSub < simdBatchSize; iSub++) |
| 315 | + { |
| 316 | + yBatch[iSub] = this->_aResponse[aIdxStart[iSub]].val; |
| 317 | + weightsBatch[iSub] = this->_aWeights[aIdxStart[iSub]].val; |
| 318 | + } |
| 319 | + |
| 320 | + // Update the vectors of means, variances, and cumulative weights |
| 321 | + PRAGMA_OMP_SIMD |
| 322 | + for (size_t iSub = 0; iSub < simdBatchSize; iSub++) |
| 323 | + { |
| 324 | + const intermSummFPType y = yBatch[iSub]; |
| 325 | + const intermSummFPType weight = weightsBatch[iSub]; |
| 326 | + sumsOfWeights[iSub] += weight; |
| 327 | + |
| 328 | + intermSummFPType meanBatch = means[iSub]; |
| 329 | + const intermSummFPType delta = y - meanBatch; |
| 330 | + meanBatch += sumsOfWeights[iSub] ? (delta * (weight / sumsOfWeights[iSub])) : 0; |
| 331 | + sumsOfSquares[iSub] += weight * (delta * (y - meanBatch)); |
| 332 | + means[iSub] = meanBatch; |
| 333 | + } |
| 334 | + } |
| 335 | + |
| 336 | + // Merge the per-lane aggregates
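| | + // using the weighted form of the pairwise update:
| | + //     mean_ab = mean_a + delta * w_b / (w_a + w_b)
| | + //     M2_ab   = M2_a + M2_b + delta * delta * w_a * w_b / (w_a + w_b)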
| 337 | + imp.mean = means[0]; |
| 338 | + imp.var = sumsOfSquares[0]; |
| 339 | + totalWeights = sumsOfWeights[0]; |
| 340 | + intermSummFPType varDeltas = 0;
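| | + // Skip leading lanes whose cumulative weight is zero: merging them while
| | + // 'totalWeights' is still zero would divide zero by zero.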
| 341 | + size_t iBatch; |
| 342 | + for (iBatch = 1; iBatch < simdBatchSize && !sumsOfWeights[iBatch]; iBatch++) |
| 343 | + ; |
| 344 | + for (; iBatch < simdBatchSize; iBatch++) |
| 345 | + { |
| 346 | + const intermSummFPType weightNew = sumsOfWeights[iBatch]; |
| 347 | + const intermSummFPType weightLeft = totalWeights; |
| 348 | + totalWeights += weightNew; |
| 349 | + const intermSummFPType fractionRight = weightNew / totalWeights; |
| 350 | + const intermSummFPType delta = means[iBatch] - imp.mean; |
| 351 | + imp.mean += delta * fractionRight; |
| 352 | + imp.var += sumsOfSquares[iBatch]; |
| 353 | + varDeltas += (delta * delta) * (weightLeft * fractionRight);
| 354 | + } |
| 355 | + imp.var += varDeltas;
| 356 | + |
| 357 | + // Process tail elements, if any |
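| | + // (leading zero-weight rows are skipped so that 'weight / totalWeights'
| | + // below never divides zero by zero)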
| 358 | + size_t i; |
| 359 | + for (i = sizeSimdLoop; i < n && !this->_aWeights[aIdx[i]].val; i++) |
| 360 | + ; |
| 361 | + for (; i < n; i++) |
| 362 | + { |
| 363 | + const intermSummFPType weight = this->_aWeights[aIdx[i]].val; |
| 364 | + const intermSummFPType y = this->_aResponse[aIdx[i]].val; |
| 365 | + const intermSummFPType delta = y - imp.mean; |
| 366 | + totalWeights += weight; |
| 367 | + imp.mean += delta * (weight / totalWeights); |
| 368 | + imp.var += weight * delta * (y - imp.mean); |
| 369 | + } |
| 370 | + if (totalWeights) imp.var /= totalWeights; // impurity is MSE
211 | 371 | } |
212 | | - if (totalWeights) imp.var /= totalWeights; //impurity is MSE |
213 | 372 | } |
214 | 373 |
|
215 | | -// Note: the debug checks throughout this file are always done in float64 precision regardless |
216 | | -// of the input data type, as otherwise they can get too inaccurate when sample sizes are more |
217 | | -// than a few million rows, up to the point where the debug calculation would be less precise |
218 | | -// than the shorthand non-debug calculation. |
219 | 374 | #ifdef DEBUG_CHECK_IMPURITY |
220 | 375 | if (!this->_weights) |
221 | 376 | { |
|