diff --git a/Makefile b/Makefile index faea4fb..a279b69 100644 --- a/Makefile +++ b/Makefile @@ -12,7 +12,7 @@ LDFLAGS = -pie SOURCES = src/main.c \ src/fe/preproc.c src/fe/lexer.c src/fe/parser.c src/fe/sema.c \ - src/ir/bir.c src/ir/bir_print.c src/ir/bir_lower.c src/ir/bir_mem2reg.c \ + src/ir/bir.c src/ir/bir_print.c src/ir/bir_lower.c src/ir/bir_mem2reg.c src/ir/bir_dce.c \ src/amdgpu/isel.c src/amdgpu/emit.c src/amdgpu/encode.c src/amdgpu/enc_tab.c OBJECTS = $(SOURCES:.c=.o) TARGET = barracuda @@ -29,9 +29,10 @@ $(TARGET): $(OBJECTS) TCFLAGS = -std=c99 -D_POSIX_C_SOURCE=200809L -Wall -Wextra -O0 -g \ -Isrc -Isrc/fe -Isrc/ir -Isrc/amdgpu TSRC = tests/tmain.c tests/tsmoke.c tests/tcomp.c tests/tenc.c \ - tests/ttabs.c tests/ttypes.c tests/terrs.c tests/tphase.c + tests/ttabs.c tests/ttypes.c tests/terrs.c tests/tphase.c \ + tests/tdce.c TOBJS = $(TSRC:.c=.o) -COBJS = src/ir/bir.o src/ir/bir_print.o src/ir/bir_lower.o src/ir/bir_mem2reg.o \ +COBJS = src/ir/bir.o src/ir/bir_print.o src/ir/bir_lower.o src/ir/bir_mem2reg.o src/ir/bir_dce.o \ src/amdgpu/encode.o src/amdgpu/enc_tab.o src/amdgpu/isel.o src/amdgpu/emit.o \ src/fe/lexer.o src/fe/parser.o src/fe/preproc.o src/fe/sema.o diff --git a/src/ir/bir_dce.c b/src/ir/bir_dce.c new file mode 100644 index 0000000..e120cac --- /dev/null +++ b/src/ir/bir_dce.c @@ -0,0 +1,348 @@ +#include "bir_dce.h" +#include + +/* + * bir_dce: dead code elimination. + * + * Remove instructions whose results are never referenced and that + * have no side effects. Iterates to fixpoint since removing one + * dead instruction may make its operands' producers dead too. + * Then compact surviving instructions and close inter-function gaps. + */ + +#define OPT_UNDEF 0xFFFFFFFFu + +/* ---- Working State ---- */ + +typedef struct { + bir_module_t *M; + uint32_t func_idx; + uint32_t base_block, num_blocks; + uint32_t base_inst; + uint32_t num_insts; + uint32_t dead[BIR_MAX_INSTS / 32]; + uint32_t use_count[BIR_MAX_INSTS]; + uint32_t inst_renum[BIR_MAX_INSTS]; +} opt_t; + +static opt_t G; + +/* ---- Helpers ---- */ + +/* Is inline operand j a block reference (not a value reference)? */ +static int is_inline_block_ref(uint16_t op, uint8_t j) +{ + switch (op) { + case BIR_BR: return j == 0; + case BIR_BR_COND: return j == 1 || j == 2; + case BIR_SWITCH: return j == 1; + case BIR_PHI: return j % 2 == 0; + default: return 0; + } +} + +/* Is extra operand j a block reference? */ +static int is_extra_block_ref(uint16_t op, uint32_t j) +{ + if (op == BIR_PHI) return j % 2 == 0; + if (op == BIR_SWITCH) return j == 1 || (j >= 3 && j % 2 == 1); + return 0; +} + +/* Check if an opcode is pure (no side effects, safe to eliminate) */ +static int is_pure_op(uint16_t op) +{ + switch (op) { + /* Arithmetic */ + case BIR_ADD: case BIR_SUB: case BIR_MUL: + case BIR_SDIV: case BIR_UDIV: case BIR_SREM: case BIR_UREM: + case BIR_FADD: case BIR_FSUB: case BIR_FMUL: case BIR_FDIV: + case BIR_FREM: + /* Bitwise */ + case BIR_AND: case BIR_OR: case BIR_XOR: + case BIR_SHL: case BIR_LSHR: case BIR_ASHR: + /* Comparison */ + case BIR_ICMP: case BIR_FCMP: + /* Conversion */ + case BIR_TRUNC: case BIR_ZEXT: case BIR_SEXT: + case BIR_FPTRUNC: case BIR_FPEXT: + case BIR_FPTOSI: case BIR_FPTOUI: case BIR_SITOFP: case BIR_UITOFP: + case BIR_PTRTOINT: case BIR_INTTOPTR: case BIR_BITCAST: + /* Math */ + case BIR_SQRT: case BIR_RSQ: case BIR_RCP: + case BIR_EXP2: case BIR_LOG2: + case BIR_SIN: case BIR_COS: + case BIR_FABS: case BIR_FLOOR: case BIR_CEIL: + case BIR_FTRUNC: case BIR_RNDNE: + case BIR_FMAX: case BIR_FMIN: + /* SSA */ + case BIR_PHI: + /* Memory (no observable effect if unused) */ + case BIR_ALLOCA: case BIR_SHARED_ALLOC: case BIR_GLOBAL_REF: case BIR_GEP: + /* Thread model (read-only intrinsics) */ + case BIR_THREAD_ID: case BIR_BLOCK_ID: + case BIR_BLOCK_DIM: case BIR_GRID_DIM: + /* Misc */ + case BIR_SELECT: + return 1; + default: + return 0; + } +} + +/* ---- Dead Code Elimination ---- */ + +static int dce_pass(opt_t *S) +{ + bir_module_t *M = S->M; + uint32_t base = S->base_inst; + uint32_t end = base + S->num_insts; + int changes = 0; + int changed = 1; + int guard = 100; + + while (changed && --guard) { + changed = 0; + + /* Build use counts for instructions in this function */ + memset(S->use_count + base, 0, S->num_insts * sizeof(uint32_t)); + + for (uint32_t i = base; i < end; i++) { + if ((S->dead[i / 32] >> (i % 32)) & 1) continue; + const bir_inst_t *I = &M->insts[i]; + + if (I->num_operands == BIR_OPERANDS_OVERFLOW) { + uint32_t start = I->operands[0]; + uint32_t count = I->operands[1]; + for (uint32_t j = 0; j < count + && (start + j) < M->num_extra_ops; j++) { + if (is_extra_block_ref(I->op, j)) continue; + uint32_t ref = M->extra_operands[start + j]; + if (ref != BIR_VAL_NONE && !BIR_VAL_IS_CONST(ref)) { + uint32_t idx = BIR_VAL_INDEX(ref); + if (idx >= base && idx < end) + S->use_count[idx]++; + } + } + } else { + for (uint8_t j = 0; j < I->num_operands + && j < BIR_OPERANDS_INLINE; j++) { + if (is_inline_block_ref(I->op, j)) continue; + uint32_t ref = I->operands[j]; + if (ref != BIR_VAL_NONE && !BIR_VAL_IS_CONST(ref)) { + uint32_t idx = BIR_VAL_INDEX(ref); + if (idx >= base && idx < end) + S->use_count[idx]++; + } + } + } + } + + /* Remove dead pure instructions */ + for (uint32_t i = base; i < end; i++) { + if ((S->dead[i / 32] >> (i % 32)) & 1) continue; + const bir_inst_t *inst = &M->insts[i]; + int pure = is_pure_op(inst->op); + if (!pure && inst->op == BIR_LOAD && inst->subop == 0) + pure = 1; /* non-volatile load, unused result = dead */ + if (S->use_count[i] == 0 && pure) { + S->dead[i / 32] |= 1u << (i % 32); + changed = 1; + changes++; + } + } + } + + return changes; +} + +/* ---- Compact ---- */ + +/* Renumber a single operand after dead instruction removal */ +static uint32_t remap_operand(uint32_t ref, const uint32_t *inst_renum, + uint32_t max_inst) +{ + if (ref == BIR_VAL_NONE || BIR_VAL_IS_CONST(ref)) + return ref; + + uint32_t idx = BIR_VAL_INDEX(ref); + if (idx < max_inst && inst_renum[idx] != OPT_UNDEF) + return BIR_MAKE_VAL(inst_renum[idx]); + + return ref; +} + +static void compact(opt_t *S) +{ + bir_module_t *M = S->M; + uint32_t old_num_insts = M->num_insts; + bir_func_t *F = &M->funcs[S->func_idx]; + + /* Build inst_renum: old absolute index -> new absolute index */ + for (uint32_t i = 0; i < old_num_insts; i++) + S->inst_renum[i] = OPT_UNDEF; + + uint32_t new_idx = S->base_inst; + for (uint32_t bi = 0; bi < S->num_blocks; bi++) { + uint32_t abs_b = S->base_block + bi; + const bir_block_t *B = &M->blocks[abs_b]; + for (uint32_t j = 0; j < B->num_insts; j++) { + uint32_t ii = B->first_inst + j; + if ((S->dead[ii / 32] >> (ii % 32)) & 1) continue; + S->inst_renum[ii] = new_idx++; + } + } + + uint32_t new_total = new_idx - S->base_inst; + + /* Compact in-place: write cursor <= read cursor since we only + * remove instructions, so no scratch buffer needed. */ + uint32_t wr = S->base_inst; + for (uint32_t bi = 0; bi < S->num_blocks; bi++) { + uint32_t abs_b = S->base_block + bi; + const bir_block_t *B = &M->blocks[abs_b]; + uint32_t block_start = wr; + for (uint32_t j = 0; j < B->num_insts; j++) { + uint32_t ii = B->first_inst + j; + if ((S->dead[ii / 32] >> (ii % 32)) & 1) continue; + if (wr != ii) + M->insts[wr] = M->insts[ii]; + wr++; + } + M->blocks[abs_b].first_inst = block_start; + M->blocks[abs_b].num_insts = wr - block_start; + } + + F->total_insts = new_total; + + /* Remap all operands in this function's instructions */ + for (uint32_t i = S->base_inst; i < S->base_inst + new_total; i++) { + bir_inst_t *I = &M->insts[i]; + + if (I->num_operands == BIR_OPERANDS_OVERFLOW) { + uint32_t start = I->operands[0]; + uint32_t count = I->operands[1]; + for (uint32_t j = 0; j < count + && (start + j) < M->num_extra_ops; j++) { + if (is_extra_block_ref(I->op, j)) continue; + M->extra_operands[start + j] = + remap_operand(M->extra_operands[start + j], + S->inst_renum, old_num_insts); + } + } else { + for (uint8_t j = 0; j < I->num_operands + && j < BIR_OPERANDS_INLINE; j++) { + if (is_inline_block_ref(I->op, j)) continue; + I->operands[j] = + remap_operand(I->operands[j], S->inst_renum, + old_num_insts); + } + } + } +} + +/* ---- Per-Function Driver ---- */ + +static int opt_run_func(opt_t *S, uint32_t fi) +{ + const bir_func_t *F = &S->M->funcs[fi]; + if (F->num_blocks == 0 || F->total_insts == 0) return 0; + + S->func_idx = fi; + S->base_block = F->first_block; + S->num_blocks = F->num_blocks; + S->base_inst = S->M->blocks[F->first_block].first_inst; + S->num_insts = F->total_insts; + + { + uint32_t lo = S->base_inst / 32; + uint32_t hi = (S->base_inst + S->num_insts + 31) / 32; + memset(S->dead + lo, 0, (hi - lo) * sizeof(uint32_t)); + } + + int changes = dce_pass(S); + + if (changes > 0) + compact(S); + + return changes; +} + +/* ---- Public API ---- */ + +int bir_dce(bir_module_t *M) +{ + opt_t *S = &G; + memset(S, 0, sizeof(*S)); + S->M = M; + + int total = 0; + for (uint32_t fi = 0; fi < M->num_funcs; fi++) + total += opt_run_func(S, fi); + + /* Close inter-function gaps (same pattern as bir_mem2reg) */ + if (total > 0) { + uint32_t dst = 0; + for (uint32_t fi = 0; fi < M->num_funcs; fi++) { + bir_func_t *F = &M->funcs[fi]; + if (F->num_blocks == 0) continue; + + uint32_t src = M->blocks[F->first_block].first_inst; + uint32_t count = F->total_insts; + + if (src == dst) { + dst += count; + continue; + } + + int32_t shift = (int32_t)dst - (int32_t)src; + + memmove(&M->insts[dst], &M->insts[src], + count * sizeof(bir_inst_t)); + + for (uint16_t bi = 0; bi < F->num_blocks; bi++) { + uint32_t abs_b = F->first_block + bi; + M->blocks[abs_b].first_inst = + (uint32_t)((int32_t)M->blocks[abs_b].first_inst + shift); + } + + for (uint32_t i = dst; i < dst + count; i++) { + bir_inst_t *I = &M->insts[i]; + if (I->num_operands == BIR_OPERANDS_OVERFLOW) { + uint32_t start = I->operands[0]; + uint32_t cnt = I->operands[1]; + for (uint32_t j = 0; j < cnt + && (start + j) < M->num_extra_ops; j++) { + if (is_extra_block_ref(I->op, j)) continue; + uint32_t ref = M->extra_operands[start + j]; + if (BIR_VAL_IS_CONST(ref) || ref == BIR_VAL_NONE) + continue; + uint32_t idx = BIR_VAL_INDEX(ref); + if (idx >= src && idx < src + count) + M->extra_operands[start + j] = + BIR_MAKE_VAL( + (uint32_t)((int32_t)idx + shift)); + } + } else { + for (uint8_t j = 0; j < I->num_operands + && j < BIR_OPERANDS_INLINE; j++) { + if (is_inline_block_ref(I->op, j)) continue; + uint32_t ref = I->operands[j]; + if (BIR_VAL_IS_CONST(ref) || ref == BIR_VAL_NONE) + continue; + uint32_t idx = BIR_VAL_INDEX(ref); + if (idx >= src && idx < src + count) + I->operands[j] = + BIR_MAKE_VAL( + (uint32_t)((int32_t)idx + shift)); + } + } + } + + dst += count; + } + M->num_insts = dst; + } + + return total; +} diff --git a/src/ir/bir_dce.h b/src/ir/bir_dce.h new file mode 100644 index 0000000..54081cd --- /dev/null +++ b/src/ir/bir_dce.h @@ -0,0 +1,16 @@ +#ifndef BARRACUDA_BIR_DCE_H +#define BARRACUDA_BIR_DCE_H + +#include "bir.h" + +/* + * Dead code elimination. + * + * Runs after mem2reg. Removes instructions whose results + * are never used and that have no side effects. + * + * Returns the total number of instructions removed (>= 0). + */ +int bir_dce(bir_module_t *M); + +#endif /* BARRACUDA_BIR_DCE_H */ diff --git a/src/main.c b/src/main.c index 8d0ca1a..dba71f2 100644 --- a/src/main.c +++ b/src/main.c @@ -4,6 +4,7 @@ #include "sema.h" #include "bir_lower.h" #include "bir_mem2reg.h" +#include "bir_dce.h" #include "amdgpu.h" #include @@ -60,6 +61,7 @@ static void usage(const char *prog) " --parse Parse and dump AST\n" " --ir Lower to BIR and print IR\n" " --no-mem2reg Skip mem2reg optimization pass\n" + " --no-dce Skip dead code elimination\n" " --sema Run semantic analysis and dump types\n" " --pp Preprocess only and print result\n" " --no-pp Skip preprocessor\n" @@ -86,6 +88,7 @@ int main(int argc, char *argv[]) int mode_amdgpu = 0; int mode_amdgpu_bin = 0; int no_mem2reg = 0; + int no_dce = 0; int no_pp = 0; amd_target_t amd_target = AMD_TARGET_GFX1100; uint32_t amd_elfm = 0x41; /* EF_AMDGPU_MACH for exact chip */ @@ -168,6 +171,8 @@ int main(int argc, char *argv[]) defines[num_defines++] = argv[i] + 2; } else if (strcmp(argv[i], "--no-mem2reg") == 0) no_mem2reg = 1; + else if (strcmp(argv[i], "--no-dce") == 0) + no_dce = 1; else if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) { usage(argv[0]); return 0; @@ -318,6 +323,8 @@ int main(int argc, char *argv[]) if (lrc == BC_OK) { if (!no_mem2reg) bir_mem2reg(bir_module); + if (!no_dce) + bir_dce(bir_module); if (mode_ir) { bir_print_module(bir_module, stdout); diff --git a/tests/tdce.c b/tests/tdce.c new file mode 100644 index 0000000..67c9e30 --- /dev/null +++ b/tests/tdce.c @@ -0,0 +1,183 @@ +/* tdce.c -- Dead code elimination tests. + * Verify that DCE removes exactly the right instructions. */ + +#include "tharns.h" + +static char obuf[TH_BUFSZ]; +static char obuf2[TH_BUFSZ]; + +/* ---- Helpers ---- */ + +static const char *strnstr_range(const char *start, const char *end, + const char *needle) +{ + size_t nlen = strlen(needle); + for (const char *p = start; p + nlen <= end; p++) { + if (memcmp(p, needle, nlen) == 0) return p; + } + return NULL; +} + +static int count_lines(const char *start, const char *end) +{ + int n = 0; + for (const char *p = start; p < end; p++) + if (*p == '\n') n++; + return n; +} + +/* ---- dce: dead chain eliminated ---- */ + +static void dce_chain(void) +{ + int rc = th_run(BC_BIN " --ir tests/test_dce.cu", obuf, TH_BUFSZ); + CHEQ(rc, 0); + /* mul and second add (the dead chain) must be gone */ + const char *fn = strstr(obuf, "dce_chain"); + CHECK(fn != NULL); + const char *fn_end = strstr(fn, "\n}"); + CHECK(fn_end != NULL); + /* dead mul must not appear */ + CHECK(strnstr_range(fn, fn_end, "= mul") == NULL); + /* live add must survive */ + CHECK(strnstr_range(fn, fn_end, "= add") != NULL); + /* store must survive */ + CHECK(strnstr_range(fn, fn_end, "store ") != NULL); + PASS(); +} +TH_REG("dce", dce_chain) + +/* ---- dce: unused non-volatile load eliminated ---- */ + +static void dce_load(void) +{ + int rc = th_run(BC_BIN " --ir tests/test_dce.cu", obuf, TH_BUFSZ); + CHEQ(rc, 0); + const char *fn = strstr(obuf, "dce_load"); + CHECK(fn != NULL); + /* Skip past signature line to search body only */ + const char *body = strchr(fn, '\n'); + CHECK(body != NULL); + const char *fn_end = strstr(body, "\n}"); + CHECK(fn_end != NULL); + /* load and gep instructions must be eliminated */ + CHECK(strnstr_range(body, fn_end, "= load") == NULL); + CHECK(strnstr_range(body, fn_end, "= gep") == NULL); + PASS(); +} +TH_REG("dce", dce_load) + +/* ---- dce: params survive even if unused ---- */ + +static void dce_param(void) +{ + int rc1 = th_run(BC_BIN " --ir tests/test_dce.cu", obuf, TH_BUFSZ); + CHEQ(rc1, 0); + int rc2 = th_run(BC_BIN " --ir --no-dce tests/test_dce.cu", + obuf2, TH_BUFSZ); + CHEQ(rc2, 0); + /* dce_params has 4 params but only %1 is used. + * DCE must not remove any — compare instruction count. */ + const char *fn1 = strstr(obuf, "dce_params"); + const char *fn2 = strstr(obuf2, "dce_params"); + CHECK(fn1 != NULL); + CHECK(fn2 != NULL); + const char *end1 = strstr(fn1, "\n}"); + const char *end2 = strstr(fn2, "\n}"); + CHECK(end1 != NULL); + CHECK(end2 != NULL); + /* Same number of instruction lines — nothing was removed */ + CHEQ(count_lines(fn1, end1), count_lines(fn2, end2)); + /* All four params still in signature */ + CHECK(strstr(fn1, "i32 %1") != NULL); + CHECK(strstr(fn1, "i32 %2") != NULL); + CHECK(strstr(fn1, "i32 %3") != NULL); + PASS(); +} +TH_REG("dce", dce_param) + +/* ---- dce: side effects survive ---- */ + +static void dce_side(void) +{ + int rc = th_run(BC_BIN " --ir tests/test_dce.cu", obuf, TH_BUFSZ); + CHEQ(rc, 0); + const char *fn = strstr(obuf, "dce_side_effects"); + CHECK(fn != NULL); + const char *fn_end = strstr(fn, "\n}"); + CHECK(fn_end != NULL); + CHECK(strnstr_range(fn, fn_end, "store ") != NULL); + CHECK(strnstr_range(fn, fn_end, "barrier") != NULL); /* unique opcode */ + PASS(); +} +TH_REG("dce", dce_side) + +/* ---- dce: empty function unchanged ---- */ + +static void dce_empty(void) +{ + int rc = th_run(BC_BIN " --ir tests/test_dce.cu", obuf, TH_BUFSZ); + CHEQ(rc, 0); + const char *fn = strstr(obuf, "dce_empty"); + CHECK(fn != NULL); + const char *fn_end = strstr(fn, "\n}"); + CHECK(fn_end != NULL); + CHECK(strnstr_range(fn, fn_end, "ret") != NULL); + PASS(); +} +TH_REG("dce", dce_empty) + +/* ---- dce: no dead code — output identical with and without DCE ---- */ + +static void dce_nop(void) +{ + int rc1 = th_run(BC_BIN " --ir tests/test_dce.cu", obuf, TH_BUFSZ); + CHEQ(rc1, 0); + int rc2 = th_run(BC_BIN " --ir --no-dce tests/test_dce.cu", + obuf2, TH_BUFSZ); + CHEQ(rc2, 0); + /* dce_all_live has no dead code — same number of instructions */ + const char *fn1 = strstr(obuf, "dce_all_live"); + const char *fn2 = strstr(obuf2, "dce_all_live"); + CHECK(fn1 != NULL); + CHECK(fn2 != NULL); + const char *end1 = strstr(fn1, "\n}"); + const char *end2 = strstr(fn2, "\n}"); + CHECK(end1 != NULL); + CHECK(end2 != NULL); + /* Same number of instruction lines */ + CHEQ(count_lines(fn1, end1), count_lines(fn2, end2)); + /* All expected opcodes survive */ + CHECK(strnstr_range(fn1, end1, "= block_id") != NULL); + CHECK(strnstr_range(fn1, end1, "= block_dim") != NULL); + CHECK(strnstr_range(fn1, end1, "= thread_id") != NULL); + CHECK(strnstr_range(fn1, end1, "= mul") != NULL); + CHECK(strnstr_range(fn1, end1, "= fadd") != NULL); + CHECK(strnstr_range(fn1, end1, "store ") != NULL); + PASS(); +} +TH_REG("dce", dce_nop) + +/* ---- dce: instruction count drops ---- */ + +static void dce_count(void) +{ + int rc1 = th_run(BC_BIN " --ir tests/test_dce.cu", obuf, TH_BUFSZ); + CHEQ(rc1, 0); + int rc2 = th_run(BC_BIN " --ir --no-dce tests/test_dce.cu", + obuf2, TH_BUFSZ); + CHEQ(rc2, 0); + /* Parse instruction counts from the summary line */ + const char *s1 = strstr(obuf, " instructions"); + const char *s2 = strstr(obuf2, " instructions"); + CHECK(s1 != NULL); + CHECK(s2 != NULL); + /* Walk backwards to find the number */ + while (s1 > obuf && s1[-1] >= '0' && s1[-1] <= '9') s1--; + while (s2 > obuf2 && s2[-1] >= '0' && s2[-1] <= '9') s2--; + int n_opt = atoi(s1); + int n_noopt = atoi(s2); + CHECK(n_noopt > n_opt); + PASS(); +} +TH_REG("dce", dce_count) diff --git a/tests/test_dce.cu b/tests/test_dce.cu new file mode 100644 index 0000000..4b3876c --- /dev/null +++ b/tests/test_dce.cu @@ -0,0 +1,39 @@ +/* test_dce.cu — Dead code elimination test cases. + * + * Each kernel targets a specific DCE edge case. + * The test harness compiles with --ir and checks which + * instructions survive. */ + +/* Dead chain: dead1 unused, dead2 depends only on dead1 */ +__global__ void dce_chain(int *out, int a, int b) { + int live = a + b; + int dead1 = a * b; + int dead2 = dead1 + 1; + out[0] = live; +} + +/* Non-volatile load dies when unused */ +__global__ void dce_load(int *src) { + int unused_load = src[0]; +} + +/* Params always survive */ +__global__ void dce_params(int *out, int a, int b, int c) { + out[0] = a; +} + +/* Side effects survive: store, barrier */ +__global__ void dce_side_effects(int *out, int x) { + out[0] = x; + __syncthreads(); +} + +/* Empty function body */ +__global__ void dce_empty(void) { +} + +/* No dead code — everything is live */ +__global__ void dce_all_live(float *out, const float *a, const float *b) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + out[i] = a[i] + b[i]; +}