Skip to content

Commit 96e868c

Browse files
committed
gpu: jit: codegen: align operands in efdiv()
1 parent 068893e commit 96e868c

File tree

1 file changed

+110
-29
lines changed

1 file changed

+110
-29
lines changed

src/gpu/jit/codegen/kernel.hpp

Lines changed: 110 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*******************************************************************************
2-
* Copyright 2022 Intel Corporation
2+
* Copyright 2022-2023 Intel Corporation
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -527,61 +527,56 @@ class ir_kernel_t : public jit_generator<hw> {
527527

528528
// Emits code for a float division, dst = src0 / src1.
// On hardware older than XeHPC the quotient is formed as src0 * inv(src1);
// otherwise fdiv_ieee() is invoked once per chunk of div_esize lanes, where
// div_esize is capped at the number of floats that fit in one GRF.
// In the updated version all three operands are asserted to be of type f and
// src1 is asserted to have getHS() == 0 (presumably a scalar broadcast
// region - confirm against callers).
void efdiv(const ngen::InstructionModifier &mod, const ngen_operand_t &dst,
529529
const ngen_operand_t &src0, const ngen_operand_t &src1) {
530-
ir_assert(!src1.is_immediate());
531-
532530
int esize = mod.getExecSize();
533531
int grf_size = ngen::GRF::bytes(hw);
534532
int div_esize = std::min(esize, grf_size / int(sizeof(float)));
535533

536-
int tmp_regs = utils::div_up(esize * int(sizeof(float)), grf_size);
534+
ir_assert(dst.type() == ngen::DataType::f);
535+
ir_assert(src0.type() == ngen::DataType::f);
536+
ir_assert(src1.type() == ngen::DataType::f);
537+
// The code below relies on src1 having a horizontal stride of zero: the
// inv-based path inverts it once into a single float sub-register.
ir_assert(src1.reg_data().getHS() == 0);
537538

538539
// fdiv_ieee() is not supported in XeHPG so we use a less precise, inv-based sequence.
539540
if (hw < ngen::HW::XeHPC) {
540-
auto tmp = ra_.alloc_range(tmp_regs);
541-
auto tmp_buf = reg_buf_t(hw, tmp);
542-
auto tmp_reg_buf = reg_buf_data_t(tmp_buf).format(
543-
0, src1.reg_buf_data().type(), esize);
544-
inv(mod, tmp[0].f(), src1.reg_buf_data());
545-
emul(mod, dst, src0, ngen_operand_t(tmp_reg_buf));
541+
auto tmp = ra_.alloc_sub<float>();
542+
// The reciprocal is computed with modifier 1 (presumably exec size 1) and
// then multiplied into src0 under the caller's modifier.
inv(1, tmp, src1.reg_data());
543+
emul(mod, dst, src0, ngen_operand_t(reg_buf_data_t(hw, tmp)));
546544
ra_.safeRelease(tmp);
547545
return;
548546
}
549547

550548
auto one = ra_.alloc().f();
551549
auto zero = ra_.alloc().f();
552-
553550
auto tmp = ra_.alloc_range(4);
554551

555-
auto src0_tmp = ra_.alloc_range(tmp_regs);
556-
auto src1_tmp = ra_.alloc_range(tmp_regs);
557-
558-
// Copy to temporary registers to ensure dst, num and denom are
559-
// distinct as required for fdiv_ieee.
560-
mov(mod, src0_tmp[0].f(), src0.reg_data());
561-
mov(mod, src1_tmp[0].f(), src1.reg_data());
562-
563552
auto div_mod = ngen::InstructionModifier(mod);
564553
div_mod.setExecSize(div_esize);
565554

566555
mov(div_mod, one, ngen::Immediate(1));
567556
mov(div_mod, zero, ngen::Immediate(0));
568557

569-
// Enable mask as fdiv_ieee relies on masked if/endif flow.
570-
setDefaultNoMask(false);
571-
572558
for (int i = 0; i < mod.getExecSize(); i += div_esize) {
573-
fdiv_ieee(div_mod, f0[0], dst.sub_reg_data(i, div_esize).reg_data(),
574-
src0_tmp[i / div_esize].f(), src1_tmp[i / div_esize].f(),
575-
zero, one, tmp);
559+
// Copy to temporary registers to ensure dst, num and denom are
560+
// distinct as required for fdiv_ieee.
561+
auto d = dst.sub_reg_data(i, div_esize).reg_data();
562+
auto s0 = src0.sub_reg_data(i, div_esize).reg_data();
563+
auto s1 = src1.sub_reg_data(i, 1).reg_data();
564+
// If any two of the three operands overlap, all three spillers are told to
// copy through a temporary.
bool force_spill = overlaps(div_esize, d, s0)
565+
|| overlaps(div_esize, d, s1)
566+
|| overlaps(div_esize, s0, s1);
567+
auto dst_rd = w_spill(d, div_esize, force_spill);
568+
auto src0_rd = r_spill(s0, div_esize, force_spill);
569+
auto src1_rd = r_spill(s1, div_esize, force_spill);
570+
// Enable mask as fdiv_ieee relies on masked if/endif flow.
571+
setDefaultNoMask(false);
572+
fdiv_ieee(div_mod, f0[0], dst_rd(), src0_rd(), src1_rd(), zero, one,
573+
tmp);
574+
// Restore NoMask before the spillers go out of scope at the end of this
// iteration; a write spiller emits its copy-back mov from its destructor.
setDefaultNoMask(true);
576575
}
577576

578577
ra_.safeRelease(one);
579578
ra_.safeRelease(zero);
580-
ra_.safeRelease(src0_tmp);
581-
ra_.safeRelease(src1_tmp);
582579
ra_.safeRelease(tmp);
583-
584-
setDefaultNoMask(true);
585580
}
586581

587582
void emod(const ngen::InstructionModifier &mod, const ngen_operand_t &dst,
@@ -788,6 +783,92 @@ class ir_kernel_t : public jit_generator<hw> {
788783
}
789784

790785
protected:
786+
class spiller_t {
787+
public:
788+
spiller_t(ir_kernel_t<hw> *host, const ngen::RegData &rd, int esize,
789+
bool read, bool write, bool force_copy)
790+
: host_(host), rd_(rd), esize_(esize), read_(read), write_(write) {
791+
if (rd.getOffset() == 0 && !force_copy) return;
792+
793+
int w = rd.getWidth();
794+
int hs = rd.getHS();
795+
int vs = rd.getVS();
796+
int grf_size = ngen::GRF::bytes(hw);
797+
int regs = utils::div_up(esize * hs * rd.getBytes(), grf_size);
798+
tmp_range_ = host_->ra_.alloc_range(regs);
799+
auto tmp = tmp_range_[0].retype(rd_.getType());
800+
tmp_ = ngen::RegisterRegion(tmp, vs, w, hs);
801+
if (read_) host_->mov(esize_, to_xd(tmp_), to_xd(rd_));
802+
}
803+
804+
spiller_t(spiller_t &&other) : spiller_t(other) {
805+
other.tmp_range_ = ngen::GRFRange();
806+
}
807+
808+
ngen::RegData operator()() const {
809+
return tmp_.isInvalid() ? rd_ : tmp_;
810+
}
811+
812+
~spiller_t() {
813+
if (tmp_range_.isInvalid()) return;
814+
if (write_) host_->mov(esize_, to_xd(rd_), to_xd(tmp_));
815+
host_->ra_.safeRelease(tmp_range_);
816+
}
817+
818+
private:
819+
spiller_t(const spiller_t &) = default;
820+
821+
static ngen::RegData to_xd(const ngen::RegData &rd) {
822+
auto ret = rd;
823+
switch (rd.getBytes()) {
824+
case 1: ret.setType(ngen::DataType::ub); break;
825+
case 2: ret.setType(ngen::DataType::uw); break;
826+
case 4: ret.setType(ngen::DataType::ud); break;
827+
default: ir_error_not_expected();
828+
}
829+
return ret;
830+
}
831+
832+
ir_kernel_t<hw> *host_ = nullptr;
833+
ngen::RegData rd_;
834+
int esize_;
835+
bool read_ = false;
836+
bool write_ = false;
837+
ngen::GRFRange tmp_range_;
838+
ngen::RegData tmp_;
839+
};
840+
841+
// Creates a spiller_t bound to this kernel for region rd.
// read/write select whether data is copied into the temporary up front
// and/or copied back when the spiller is destroyed; force_copy requests a
// temporary even if rd has a zero offset.
spiller_t spill(const ngen::RegData &rd, int esize, bool read, bool write,
        bool force_copy) {
    return {this, rd, esize, read, write, force_copy};
}
845+
846+
// Spiller for a source operand: data is copied into the temporary (if one is
// used) and never copied back.
spiller_t r_spill(
        const ngen::RegData &rd, int esize, bool force_copy = false) {
    const bool copy_in = true;
    const bool copy_out = false;
    return spill(rd, esize, copy_in, copy_out, force_copy);
}
850+
851+
// Spiller for a destination operand: nothing is copied in; the temporary (if
// one is used) is copied back to rd when the spiller is destroyed.
spiller_t w_spill(
        const ngen::RegData &rd, int esize, bool force_copy = false) {
    const bool copy_in = false;
    const bool copy_out = true;
    return spill(rd, esize, copy_in, copy_out, force_copy);
}
855+
856+
static bool overlaps(
857+
int esize, const ngen::RegData &a, const ngen::RegData &b) {
858+
int grf_size = ngen::GRF::bytes(hw);
859+
int a_beg = a.getBase() * grf_size + a.getByteOffset();
860+
int b_beg = b.getBase() * grf_size + b.getByteOffset();
861+
int a_end = a_beg + esize * a.getHS() * a.getBytes() - 1;
862+
int b_end = b_beg + esize * b.getHS() * b.getBytes() - 1;
863+
a_beg /= grf_size;
864+
b_beg /= grf_size;
865+
a_end /= grf_size;
866+
b_end /= grf_size;
867+
if (a_beg <= b_beg && b_beg <= a_end) return true;
868+
if (a_beg <= b_end && b_end <= a_end) return true;
869+
return false;
870+
}
871+
791872
std::string kernel_name_;
792873
exec_config_t exec_cfg_;
793874
kernel_info_t kernel_info_;

0 commit comments

Comments
 (0)