Skip to content

Commit 65cc07b

Browse files
committed
__restrict__
Slight performance win
1 parent ed63dca commit 65cc07b

File tree

3 files changed

+107
-53
lines changed

3 files changed

+107
-53
lines changed

AGENTS.md

Lines changed: 53 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1195,7 +1195,7 @@ downstream packages (e.g., TreeSearch) can use `LinkingTo: TreeDist`:
11951195
`src/lap.h` is now a thin wrapper that includes `<TreeDist/lap.h>` and
11961196
re-exports types to global scope.
11971197

1198-
### LAPJV codegen regression (diagnosed, partially mitigated)
1198+
### LAPJV codegen regression (diagnosed, characterised, accepted)
11991199

12001200
Including `lap_impl.h` in `lap.cpp` changed the TU context enough for GCC 14's
12011201
register allocator to produce ~8% more instructions in the Dijkstra hot loop,
@@ -1213,9 +1213,58 @@ with `__attribute__((optimize("align-functions=64", "align-loops=16")))`.
12131213
The `lapjv()` wrapper fills the transposed buffer first (matching R's
12141214
column-major storage) then untransposes — restoring the cache-friendly pattern.
12151215

1216-
**Residual:** ~5–9% on LAPJV 1999×1999 vs main (from the CostMatrix class
1217-
definition difference). Tree distance metrics are unaffected — they call
1218-
`lap()` from `pairwise_distances.cpp`, whose TU context is unchanged.
1216+
**VTune profiling (vtune-lap/, 2026-04-01):** Profiled with `-O2 -g
1217+
-fno-omit-frame-pointer` on the current branch (LAPJV 1999×1999 ×30s +
1218+
400×400 ×30s + CID/MSD random 200-tip ×30s each).
1219+
1220+
| Function | CPU Time | % | Notes |
1221+
|---|---|---|---|
1222+
| `TreeDist::CostMatrix::findRowSubmin` | 23.8s | 19.8% | Inlined into lap() at -O2 (separate symbol with -g) |
1223+
| `TreeDist::lap` | 23.4s | 19.5% | Dijkstra phase dominates |
1224+
| `msd_score` | 9.4s | 7.8% | Contains inlined lap() |
1225+
| `mutual_clustering_score` | 9.3s | 7.7% | Contains inlined lap() |
1226+
| `lapjv` (R wrapper) | 6.0s | 5.0% | Matrix conversion overhead |
1227+
1228+
Source-line breakdown of `lap()`:
1229+
1230+
| Phase | Lines | CPU Time (s) | % of lap() |
1231+
|---|---|---|---|
1232+
| Dijkstra inner update loop | 308–325 | ~16.5 | 72% |
1233+
| Dijkstra min-scan | 274–287 | ~3.5 | 15% |
1234+
| Matrix fill (lapjv wrapper) | 61–65 | ~2.7 | 12% |
1235+
| findRowSubmin (reduction) | 173–184 | ~2.0 | 9% |
1236+
1237+
**Optimisation attempts:**
1238+
1239+
1. **`__restrict__` on Dijkstra pointers** (DONE, kept): Added restrict-qualified
1240+
local pointers (`d_ptr`, `pred_ptr`, `cl_ptr`) for the augment-solution phase
1241+
to eliminate potential alias reloads. Applied to both `lap.cpp` and `lap_impl.h`.
1242+
Benchmark: no measurable change in regression magnitude, but correct optimisation.
1243+
1244+
2. **`#pragma GCC optimize("O3")`** (attempted, reverted): Forced O3 for the entire
1245+
`lap.cpp` TU. Benchmark: no measurable improvement; same alignment pattern.
1246+
1247+
3. **Per-object Makevars flags** (attempted, failed): R's build system does not support
1248+
per-target variable overrides in `Makevars.win`.
1249+
1250+
**Residual regression (confirmed, alignment-dependent):**
1251+
1252+
| LAPJV size | dev/ref ratio | Direction |
1253+
|---|---|---|
1254+
| 400×400 | 0.81 | **Dev 19% faster** (alignment win) |
1255+
| 1999×1999 | 1.08–1.13 | **Dev 8–13% slower** (alignment loss) |
1256+
1257+
The regression is alignment-sensitive and manifests differently at different problem
1258+
sizes. The same codegen change that makes 400×400 faster makes 1999×1999 slower.
1259+
This is the classic instruction-alignment lottery: the Dijkstra inner loop's branch
1260+
prediction and instruction fetch are affected by code placement that varies with
1261+
TU context.
1262+
1263+
**Conclusion:** The regression cannot be reliably eliminated without matching main's
1264+
exact TU context (adding dead code back to the installed header, which is
1265+
unacceptable for CRAN). **Tree distance metrics are completely unaffected**
1266+
`lap()` is called from `pairwise_distances.cpp` (different TU context), and
1267+
benchmarks confirm neutral performance across all metrics and tree sizes.
12191268

12201269
**Maintenance note:** if the `lap()` algorithm changes, update BOTH `src/lap.cpp`
12211270
and `inst/include/TreeDist/lap_impl.h`. The duplication is intentional — it

inst/include/TreeDist/lap_impl.h

Lines changed: 27 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -145,22 +145,25 @@ cost lap(const lap_row dim,
145145
} while (loopcnt < 2);
146146

147147
// AUGMENT SOLUTION for each free row.
148-
auto& d = scratch.d;
149-
auto& predecessor = scratch.predecessor;
148+
// Restrict-qualified local pointers enable the compiler to avoid
149+
// reloads after stores in the Dijkstra inner loop.
150+
cost* __restrict__ d_ptr = scratch.d.data();
151+
lap_row* __restrict__ pred_ptr = scratch.predecessor.data();
152+
lap_col* __restrict__ cl_ptr = scratch.col_list.data();
150153

151154
for (lap_row f = 0; f < num_free; ++f) {
152155
bool unassignedfound = false;
153156
lap_row free_row = freeunassigned[f];
154-
const cost* free_row_cost = input_cost.row(free_row);
157+
const cost* __restrict__ free_row_cost = input_cost.row(free_row);
155158
lap_col endofpath = 0;
156159
lap_col last = 0;
157160
lap_row i;
158161
lap_col j1;
159162

160163
for (lap_col j = 0; j < dim; ++j) {
161-
d[j] = free_row_cost[j] - v_ptr[j];
162-
predecessor[j] = free_row;
163-
col_list[j] = j;
164+
d_ptr[j] = free_row_cost[j] - v_ptr[j];
165+
pred_ptr[j] = free_row;
166+
cl_ptr[j] = j;
164167
}
165168

166169
cost min = 0;
@@ -170,63 +173,63 @@ cost lap(const lap_row dim,
170173
do {
171174
if (up == low) {
172175
last = low - 1;
173-
min = d[col_list[up++]];
176+
min = d_ptr[cl_ptr[up++]];
174177

175178
for (lap_dim k = up; k < dim; ++k) {
176-
const lap_col j = col_list[k];
177-
const cost h = d[j];
179+
const lap_col j = cl_ptr[k];
180+
const cost h = d_ptr[j];
178181
if (h <= min) {
179182
if (h < min) {
180183
up = low;
181184
min = h;
182185
}
183-
col_list[k] = col_list[up];
184-
col_list[up++] = j;
186+
cl_ptr[k] = cl_ptr[up];
187+
cl_ptr[up++] = j;
185188
}
186189
}
187190
for (lap_dim k = low; k < up; ++k) {
188-
if (colsol[col_list[k]] < 0) {
189-
endofpath = col_list[k];
191+
if (colsol[cl_ptr[k]] < 0) {
192+
endofpath = cl_ptr[k];
190193
unassignedfound = true;
191194
break;
192195
}
193196
}
194197
}
195198

196199
if (!unassignedfound) {
197-
j1 = col_list[low++];
200+
j1 = cl_ptr[low++];
198201
i = colsol[j1];
199-
const cost* row_i = input_cost.row(i);
202+
const cost* __restrict__ row_i = input_cost.row(i);
200203
const cost h = row_i[j1] - v_ptr[j1] - min;
201204

202205
for (lap_dim k = up; k < dim; ++k) {
203-
const lap_col j = col_list[k];
206+
const lap_col j = cl_ptr[k];
204207
cost v2 = row_i[j] - v_ptr[j] - h;
205-
if (v2 < d[j]) {
206-
predecessor[j] = i;
208+
if (v2 < d_ptr[j]) {
209+
pred_ptr[j] = i;
207210
if (v2 == min) {
208211
if (colsol[j] < 0) {
209212
endofpath = j;
210213
unassignedfound = true;
211214
break;
212215
} else {
213-
col_list[k] = col_list[up];
214-
col_list[up++] = j;
216+
cl_ptr[k] = cl_ptr[up];
217+
cl_ptr[up++] = j;
215218
}
216219
}
217-
d[j] = v2;
220+
d_ptr[j] = v2;
218221
}
219222
}
220223
}
221224
} while (!unassignedfound);
222225

223226
for (lap_dim k = 0; k <= last; ++k) {
224-
j1 = col_list[k];
225-
v[j1] += d[j1] - min;
227+
j1 = cl_ptr[k];
228+
v[j1] += d_ptr[j1] - min;
226229
}
227230

228231
do {
229-
i = predecessor[endofpath];
232+
i = pred_ptr[endofpath];
230233
colsol[endofpath] = i;
231234
j1 = endofpath;
232235
endofpath = rowsol[i];

src/lap.cpp

Lines changed: 27 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,6 @@ cost lap(const lap_row dim,
186186
}
187187

188188
// AUGMENTING ROW REDUCTION
189-
auto& col_list = scratch.col_list; // List of columns to be scanned in various ways.
190189
int loopcnt = 0; // do-loop to be done twice.
191190

192191
do {
@@ -239,13 +238,16 @@ cost lap(const lap_row dim,
239238
} while (loopcnt < 2); // Repeat once.
240239

241240
// AUGMENT SOLUTION for each free row.
242-
auto& d = scratch.d; // 'Cost-distance' in augmenting path calculation.
243-
auto& predecessor = scratch.predecessor; // Row-predecessor of column in augmenting/alternating path.
241+
// Restrict-qualified local pointers enable the compiler to avoid
242+
// reloads after stores in the Dijkstra inner loop.
243+
cost* __restrict__ d_ptr = scratch.d.data();
244+
lap_row* __restrict__ pred_ptr = scratch.predecessor.data();
245+
lap_col* __restrict__ cl_ptr = scratch.col_list.data();
244246

245247
for (lap_row f = 0; f < num_free; ++f) {
246248
bool unassignedfound = false;
247249
lap_row free_row = freeunassigned[f]; // Start row of augmenting path.
248-
const cost* free_row_cost = input_cost.row(free_row);
250+
const cost* __restrict__ free_row_cost = input_cost.row(free_row);
249251
lap_col endofpath = 0;
250252
lap_col last = 0;
251253
lap_row i;
@@ -254,9 +256,9 @@ cost lap(const lap_row dim,
254256
// Dijkstra shortest path algorithm.
255257
// Runs until unassigned column added to shortest path tree.
256258
for (lap_col j = 0; j < dim; ++j) {
257-
d[j] = free_row_cost[j] - v_ptr[j];
258-
predecessor[j] = free_row;
259-
col_list[j] = j; // Init column list.
259+
d_ptr[j] = free_row_cost[j] - v_ptr[j];
260+
pred_ptr[j] = free_row;
261+
cl_ptr[j] = j; // Init column list.
260262
}
261263

262264
cost min = 0;
@@ -271,26 +273,26 @@ cost lap(const lap_row dim,
271273

272274
// Scan columns for up..dim-1 to find all indices for which new minimum occurs.
273275
// Store these indices between low..up-1 (increasing up).
274-
min = d[col_list[up++]];
276+
min = d_ptr[cl_ptr[up++]];
275277

276278
for (lap_dim k = up; k < dim; ++k) {
277-
const lap_col j = col_list[k];
278-
const cost h = d[j];
279+
const lap_col j = cl_ptr[k];
280+
const cost h = d_ptr[j];
279281
if (h <= min) {
280282
if (h < min) { // New minimum.
281283
up = low; // Restart list at index low.
282284
min = h;
283285
}
284286
// New index with same minimum, put on undex up, and extend list.
285-
col_list[k] = col_list[up];
286-
col_list[up++] = j;
287+
cl_ptr[k] = cl_ptr[up];
288+
cl_ptr[up++] = j;
287289
}
288290
}
289291
// Check if any of the minimum columns happens to be unassigned.
290292
// If so, we have an augmenting path right away.
291293
for (lap_dim k = low; k < up; ++k) {
292-
if (colsol[col_list[k]] < 0) {
293-
endofpath = col_list[k];
294+
if (colsol[cl_ptr[k]] < 0) {
295+
endofpath = cl_ptr[k];
294296
unassignedfound = true;
295297
break;
296298
}
@@ -300,16 +302,16 @@ cost lap(const lap_row dim,
300302
if (!unassignedfound) {
301303
// Update 'distances' between free_row and all unscanned columns,
302304
// via next scanned column.
303-
j1 = col_list[low++];
305+
j1 = cl_ptr[low++];
304306
i = colsol[j1];
305-
const cost* row_i = input_cost.row(i);
307+
const cost* __restrict__ row_i = input_cost.row(i);
306308
const cost h = row_i[j1] - v_ptr[j1] - min;
307309

308310
for (lap_dim k = up; k < dim; ++k) {
309-
const lap_col j = col_list[k];
311+
const lap_col j = cl_ptr[k];
310312
cost v2 = row_i[j] - v_ptr[j] - h;
311-
if (v2 < d[j]) {
312-
predecessor[j] = i;
313+
if (v2 < d_ptr[j]) {
314+
pred_ptr[j] = i;
313315
if (v2 == min) { // New column found at same minimum value
314316
if (colsol[j] < 0) {
315317
// If unassigned, shortest augmenting path is complete.
@@ -318,25 +320,25 @@ cost lap(const lap_row dim,
318320
break;
319321
} else {
320322
// Else add to list to be scanned right away.
321-
col_list[k] = col_list[up];
322-
col_list[up++] = j;
323+
cl_ptr[k] = cl_ptr[up];
324+
cl_ptr[up++] = j;
323325
}
324326
}
325-
d[j] = v2; // <MS: Unintended>
327+
d_ptr[j] = v2;
326328
}
327329
}
328330
}
329331
} while (!unassignedfound);
330332

331333
// Update column prices.
332334
for(lap_dim k = 0; k <= last; ++k) {
333-
j1 = col_list[k];
334-
v[j1] += d[j1] - min;
335+
j1 = cl_ptr[k];
336+
v[j1] += d_ptr[j1] - min;
335337
}
336338

337339
// Reset row and column assignments along the alternating path.
338340
do {
339-
i = predecessor[endofpath];
341+
i = pred_ptr[endofpath];
340342
colsol[endofpath] = i;
341343
j1 = endofpath;
342344
endofpath = rowsol[i];

0 commit comments

Comments
 (0)