File tree Expand file tree Collapse file tree 1 file changed +20
-0
lines changed Expand file tree Collapse file tree 1 file changed +20
-0
lines changed Original file line number Diff line number Diff line change @@ -1644,6 +1644,26 @@ walk_order_t compute_walk_order(const config_t &cfg) {
16441644 size_t ab_bytes = get_memory_footprint (cfg, inner, outer);
16451645 if (ab_bytes <= l3_size) grid_inner = std::move (outer);
16461646 }
1647+
1648+ // Prefer square spatial dimensions to increase cache reuse due to iteration
1649+ // over kernel spatial dimensions. This optimization can likely be extended
1650+ // to bwd_d as well; it just hasn't been analyzed yet.
1651+ if (cfg.prb ().is_fwd ) {
1652+ auto &w_inner = grid_inner[pvars::ow];
1653+ auto &h_inner = grid_inner[pvars::oh];
1654+ auto rebalance_hw = [&]() {
1655+ if (grid_tile[pvars::oh] % (h_inner * 2 )) return false ;
1656+ if (w_inner % 2 ) return false ;
1657+ if (w_inner < h_inner * 4 ) return false ;
1658+ return true ;
1659+ };
1660+
1661+ while (rebalance_hw ()) {
1662+ w_inner /= 2 ;
1663+ h_inner *= 2 ;
1664+ }
1665+ }
1666+
16471667 // Add the blocks in this order:
16481668 // - Step 1. Add grid_inner blocks (fitting L3 cache)
16491669 // - Step 2. Add the remaining M/N blocks
You can’t perform that action at this time.
0 commit comments