|
1 | 1 | /******************************************************************************* |
2 | | -* Copyright 2022 Intel Corporation |
| 2 | +* Copyright 2022-2023 Intel Corporation |
3 | 3 | * |
4 | 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | 5 | * you may not use this file except in compliance with the License. |
@@ -527,61 +527,56 @@ class ir_kernel_t : public jit_generator<hw> { |
527 | 527 |
|
// NOTE: this block is a rendered diff hunk. Rows marked `-` belong to the
// removed implementation, rows marked `+` to its replacement, and unmarked
// rows are shared context.
//
// efdiv emits a float division of src0 by src1 into dst for the execution
// size carried by `mod`.
//  - On hardware below XeHPC it uses inv() followed by emul() instead of
//    fdiv_ieee() (see the in-code comment about XeHPG).
//  - Otherwise it calls fdiv_ieee() once per chunk of div_esize elements,
//    where div_esize is capped at the number of floats in one GRF.
// The `+` rows additionally assert that dst, src0 and src1 have type f and
// that src1.reg_data().getHS() == 0.
528 | 528 | void efdiv(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
529 | 529 | const ngen_operand_t &src0, const ngen_operand_t &src1) { |
530 | | - ir_assert(!src1.is_immediate()); |
531 | | - |
532 | 530 | int esize = mod.getExecSize(); |
533 | 531 | int grf_size = ngen::GRF::bytes(hw); |
534 | 532 | int div_esize = std::min(esize, grf_size / int(sizeof(float))); |
535 | 533 |
|
536 | | - int tmp_regs = utils::div_up(esize * int(sizeof(float)), grf_size); |
| 534 | + ir_assert(dst.type() == ngen::DataType::f); |
| 535 | + ir_assert(src0.type() == ngen::DataType::f); |
| 536 | + ir_assert(src1.type() == ngen::DataType::f); |
| 537 | + ir_assert(src1.reg_data().getHS() == 0); |
537 | 538 |
|
538 | 539 | // fdiv_ieee() is not supported in XeHPG so we use a less precise, inv-based sequence. |
539 | 540 | if (hw < ngen::HW::XeHPC) { |
540 | | - auto tmp = ra_.alloc_range(tmp_regs); |
541 | | - auto tmp_buf = reg_buf_t(hw, tmp); |
542 | | - auto tmp_reg_buf = reg_buf_data_t(tmp_buf).format( |
543 | | - 0, src1.reg_buf_data().type(), esize); |
544 | | - inv(mod, tmp[0].f(), src1.reg_buf_data()); |
545 | | - emul(mod, dst, src0, ngen_operand_t(tmp_reg_buf)); |
// Replacement rows: inv() is issued with execution size 1 into a single
// float subregister, which is then passed to emul() as the multiplier.
| 541 | + auto tmp = ra_.alloc_sub<float>(); |
| 542 | + inv(1, tmp, src1.reg_data()); |
| 543 | + emul(mod, dst, src0, ngen_operand_t(reg_buf_data_t(hw, tmp))); |
546 | 544 | ra_.safeRelease(tmp); |
547 | 545 | return; |
548 | 546 | } |
549 | 547 |
|
550 | 548 | auto one = ra_.alloc().f(); |
551 | 549 | auto zero = ra_.alloc().f(); |
552 | | - |
553 | 550 | auto tmp = ra_.alloc_range(4); |
554 | 551 |
|
555 | | - auto src0_tmp = ra_.alloc_range(tmp_regs); |
556 | | - auto src1_tmp = ra_.alloc_range(tmp_regs); |
557 | | - |
558 | | - // Copy to temporary registers to ensure dst, num and denom are |
559 | | - // distinct as required for fdiv_ieee. |
560 | | - mov(mod, src0_tmp[0].f(), src0.reg_data()); |
561 | | - mov(mod, src1_tmp[0].f(), src1.reg_data()); |
562 | | - |
563 | 552 | auto div_mod = ngen::InstructionModifier(mod); |
564 | 553 | div_mod.setExecSize(div_esize); |
565 | 554 |
|
566 | 555 | mov(div_mod, one, ngen::Immediate(1)); |
567 | 556 | mov(div_mod, zero, ngen::Immediate(0)); |
568 | 557 |
|
569 | | - // Enable mask as fdiv_ieee relies on masked if/endif flow. |
570 | | - setDefaultNoMask(false); |
571 | | - |
572 | 558 | for (int i = 0; i < mod.getExecSize(); i += div_esize) { |
573 | | - fdiv_ieee(div_mod, f0[0], dst.sub_reg_data(i, div_esize).reg_data(), |
574 | | - src0_tmp[i / div_esize].f(), src1_tmp[i / div_esize].f(), |
575 | | - zero, one, tmp); |
// Replacement rows: the operands of each chunk go through w_spill()/r_spill().
// force_spill is set when overlaps() reports that any pair of the three
// operands shares registers. The NoMask default is switched off only around
// the fdiv_ieee() call and switched back on before the iteration ends.
| 559 | + // Copy to temporary registers to ensure dst, num and denom are |
| 560 | + // distinct as required for fdiv_ieee. |
| 561 | + auto d = dst.sub_reg_data(i, div_esize).reg_data(); |
| 562 | + auto s0 = src0.sub_reg_data(i, div_esize).reg_data(); |
| 563 | + auto s1 = src1.sub_reg_data(i, 1).reg_data(); |
| 564 | + bool force_spill = overlaps(div_esize, d, s0) |
| 565 | + || overlaps(div_esize, d, s1) |
| 566 | + || overlaps(div_esize, s0, s1); |
| 567 | + auto dst_rd = w_spill(d, div_esize, force_spill); |
| 568 | + auto src0_rd = r_spill(s0, div_esize, force_spill); |
| 569 | + auto src1_rd = r_spill(s1, div_esize, force_spill); |
| 570 | + // Enable mask as fdiv_ieee relies on masked if/endif flow. |
| 571 | + setDefaultNoMask(false); |
| 572 | + fdiv_ieee(div_mod, f0[0], dst_rd(), src0_rd(), src1_rd(), zero, one, |
| 573 | + tmp); |
| 574 | + setDefaultNoMask(true); |
576 | 575 | } |
577 | 576 |
|
578 | 577 | ra_.safeRelease(one); |
579 | 578 | ra_.safeRelease(zero); |
580 | | - ra_.safeRelease(src0_tmp); |
581 | | - ra_.safeRelease(src1_tmp); |
582 | 579 | ra_.safeRelease(tmp); |
583 | | - |
584 | | - setDefaultNoMask(true); |
585 | 580 | } |
586 | 581 |
|
587 | 582 | void emod(const ngen::InstructionModifier &mod, const ngen_operand_t &dst, |
@@ -788,6 +783,92 @@ class ir_kernel_t : public jit_generator<hw> { |
788 | 783 | } |
789 | 784 |
|
790 | 785 | protected: |
| 786 | + class spiller_t { |
| 787 | + public: |
| 788 | + spiller_t(ir_kernel_t<hw> *host, const ngen::RegData &rd, int esize, |
| 789 | + bool read, bool write, bool force_copy) |
| 790 | + : host_(host), rd_(rd), esize_(esize), read_(read), write_(write) { |
| 791 | + if (rd.getOffset() == 0 && !force_copy) return; |
| 792 | + |
| 793 | + int w = rd.getWidth(); |
| 794 | + int hs = rd.getHS(); |
| 795 | + int vs = rd.getVS(); |
| 796 | + int grf_size = ngen::GRF::bytes(hw); |
| 797 | + int regs = utils::div_up(esize * hs * rd.getBytes(), grf_size); |
| 798 | + tmp_range_ = host_->ra_.alloc_range(regs); |
| 799 | + auto tmp = tmp_range_[0].retype(rd_.getType()); |
| 800 | + tmp_ = ngen::RegisterRegion(tmp, vs, w, hs); |
| 801 | + if (read_) host_->mov(esize_, to_xd(tmp_), to_xd(rd_)); |
| 802 | + } |
| 803 | + |
| 804 | + spiller_t(spiller_t &&other) : spiller_t(other) { |
| 805 | + other.tmp_range_ = ngen::GRFRange(); |
| 806 | + } |
| 807 | + |
| 808 | + ngen::RegData operator()() const { |
| 809 | + return tmp_.isInvalid() ? rd_ : tmp_; |
| 810 | + } |
| 811 | + |
| 812 | + ~spiller_t() { |
| 813 | + if (tmp_range_.isInvalid()) return; |
| 814 | + if (write_) host_->mov(esize_, to_xd(rd_), to_xd(tmp_)); |
| 815 | + host_->ra_.safeRelease(tmp_range_); |
| 816 | + } |
| 817 | + |
| 818 | + private: |
| 819 | + spiller_t(const spiller_t &) = default; |
| 820 | + |
| 821 | + static ngen::RegData to_xd(const ngen::RegData &rd) { |
| 822 | + auto ret = rd; |
| 823 | + switch (rd.getBytes()) { |
| 824 | + case 1: ret.setType(ngen::DataType::ub); break; |
| 825 | + case 2: ret.setType(ngen::DataType::uw); break; |
| 826 | + case 4: ret.setType(ngen::DataType::ud); break; |
| 827 | + default: ir_error_not_expected(); |
| 828 | + } |
| 829 | + return ret; |
| 830 | + } |
| 831 | + |
| 832 | + ir_kernel_t<hw> *host_ = nullptr; |
| 833 | + ngen::RegData rd_; |
| 834 | + int esize_; |
| 835 | + bool read_ = false; |
| 836 | + bool write_ = false; |
| 837 | + ngen::GRFRange tmp_range_; |
| 838 | + ngen::RegData tmp_; |
| 839 | + }; |
| 840 | + |
| 841 | + spiller_t spill(const ngen::RegData &rd, int esize, bool read, bool write, |
| 842 | + bool force_copy) { |
| 843 | + return spiller_t(this, rd, esize, read, write, force_copy); |
| 844 | + } |
| 845 | + |
| 846 | + spiller_t r_spill( |
| 847 | + const ngen::RegData &rd, int esize, bool force_copy = false) { |
| 848 | + return spill(rd, esize, true, false, force_copy); |
| 849 | + } |
| 850 | + |
| 851 | + spiller_t w_spill( |
| 852 | + const ngen::RegData &rd, int esize, bool force_copy = false) { |
| 853 | + return spill(rd, esize, false, true, force_copy); |
| 854 | + } |
| 855 | + |
| 856 | + static bool overlaps( |
| 857 | + int esize, const ngen::RegData &a, const ngen::RegData &b) { |
| 858 | + int grf_size = ngen::GRF::bytes(hw); |
| 859 | + int a_beg = a.getBase() * grf_size + a.getByteOffset(); |
| 860 | + int b_beg = b.getBase() * grf_size + b.getByteOffset(); |
| 861 | + int a_end = a_beg + esize * a.getHS() * a.getBytes() - 1; |
| 862 | + int b_end = b_beg + esize * b.getHS() * b.getBytes() - 1; |
| 863 | + a_beg /= grf_size; |
| 864 | + b_beg /= grf_size; |
| 865 | + a_end /= grf_size; |
| 866 | + b_end /= grf_size; |
| 867 | + if (a_beg <= b_beg && b_beg <= a_end) return true; |
| 868 | + if (a_beg <= b_end && b_end <= a_end) return true; |
| 869 | + return false; |
| 870 | + } |
| 871 | + |
791 | 872 | std::string kernel_name_; |
792 | 873 | exec_config_t exec_cfg_; |
793 | 874 | kernel_info_t kernel_info_; |
|
0 commit comments