@@ -175,9 +175,14 @@ class GenerateCandidates {
175175
176176 // The number of A and B columns to read between updating `C`.
177177 SizeVec KC (size_t mr, MMOrder order) const {
178- // Must return the actual value: although ignored by `RangesOfKC`, this will
179- // be used in MC() and NC().
180- if (IsOneKC (order)) return SizeVec (1 , K_);
178+ if (IsOneKC (order)) {
179+ // A single KC range is infeasible when K exceeds the max. The caller
180+ // will skip all configs with `order`.
181+ if (K_ > kMaxKC ) return SizeVec ();
182+ // Must return the actual value: although ignored by `RangesOfKC`, this
183+ // will be used in MC() and NC().
184+ return SizeVec (1 , K_);
185+ }
181186 // `LoopKC` handles up to `mr` rows of A.
182187 const size_t rows_a = HWY_MIN (max_M_, mr);
183188
@@ -227,13 +232,21 @@ class GenerateCandidates {
227232
228233 // The number of (L2 resident) A rows for `A2C0` to loop over.
229234 SizeVec MC (size_t mr, size_t kc, MMOrder order) const {
230- // Must return the actual value: although ignored by `RangesOfMC`, this will
231- // be used in NC().
232- if (IsOneMC (order) || max_M_ <= mr) return SizeVec (1 , max_M_);
235+ if (max_M_ <= mr) return SizeVec (1 , max_M_);
236+ if (IsOneMC (order)) {
237+ // A single MC range is infeasible when M exceeds the max. The caller
238+ // will skip all configs with `order`.
239+ if (max_M_ > kMaxMC ) return SizeVec ();
240+ // Must return the actual value: although ignored by `RangesOfMC`, this
241+ // will be used in NC().
242+ return SizeVec (1 , max_M_);
243+ }
233244
234245 // Typically 12-24K. The B rows are pinned in L1, but also occupy L2 because
235246 // it is typically inclusive.
236247 const size_t bytes_b = kNR * kc * (sizeof (SfpStream) + sizeof (BF16));
248+ // `kc` was chosen to fit in L1, hence this should not exceed L2.
249+ HWY_ASSERT (bytes_b <= cache_.L2Bytes ());
237250
238251 // Choose the largest feasible `mc_max` (A/C rows) to maximize reuse of the
239252 // packed B. We want `mc * kc` elements of A to fit in L2, alongside
@@ -242,7 +255,7 @@ class GenerateCandidates {
242255 size_t mc_max = hwy::DivCeil (cache_.L2Bytes () - bytes_b, bytes_per_mc);
243256 mc_max = HWY_MIN (mc_max, HWY_MIN (kMaxBatchSize , kMaxMC ));
244257 mc_max = HWY_MIN (mc_max, max_M_);
245- HWY_DASSERT (mc_max != 0 );
258+ HWY_ASSERT (mc_max != 0 );
246259
247260 SizeVec all_mc;
248261 all_mc.reserve (6 );
0 commit comments