/*
 * Copyright © 2019 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_scoreboard.cpp
 *
 * Gfx12+ hardware lacks the register scoreboard logic that used to guarantee
 * data coherency between register reads and writes in previous generations.
 * This lowering pass runs after register allocation in order to make up for
 * it.
 *
 * It works by performing global dataflow analysis in order to determine the
 * set of potential dependencies of every instruction in the shader, and then
 * inserting any required SWSB annotations and additional SYNC instructions in
 * order to guarantee data coherency.
 *
 * WARNING - Access of the following (rarely used) ARF registers is not
 *           tracked here, and requires the RegDist SWSB annotation to be set
 *           to 1 by the generator in order to avoid data races:
 *
 *  - sp      stack pointer
 *  - sr0     state register
 *  - cr0     control register
 *  - ip      instruction pointer
 *  - tm0     timestamp register
 *  - dbg0    debug register
 *  - acc2-9  special accumulator registers on TGL
 *  - mme0-7  math macro extended accumulator registers
 *
 * The following ARF registers don't need to be tracked here because data
 * coherency is still provided transparently by the hardware:
 *
 *  - f0-1    flag registers
 *  - n0      notification register
 *  - tdr0    thread dependency register
 */

#include "brw_fs.h"
#include "brw_cfg.h"

using namespace brw;

namespace {
   /**
    * In-order instruction accounting.
    * @{
    */

   /**
    * Return the RegDist pipeline the hardware will synchronize with if no
    * pipeline information is provided in the SWSB annotation of an
    * instruction (e.g. when TGL_PIPE_NONE is specified in tgl_swsb).
    */
   tgl_pipe
   inferred_sync_pipe(const struct intel_device_info *devinfo,
                      const fs_inst *inst)
   {
      if (devinfo->verx10 >= 125) {
         bool has_int_src = false, has_long_src = false;

         if (is_send(inst))
            return TGL_PIPE_NONE;

         for (unsigned i = 0; i < inst->sources; i++) {
            if (inst->src[i].file != BAD_FILE &&
                !inst->is_control_source(i)) {
               const brw_reg_type t = inst->src[i].type;
               has_int_src |= !brw_reg_type_is_floating_point(t);
               has_long_src |= type_sz(t) >= 8;
            }
         }

         return has_long_src ? TGL_PIPE_LONG :
                has_int_src ? TGL_PIPE_INT : TGL_PIPE_FLOAT;
      } else {
         return TGL_PIPE_FLOAT;
      }
   }
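
   /* Illustrative reading of the rules above (an assumption about common
    * cases rather than an exhaustive list): on Gfx12.5+ a bare RegDist
    * annotation on an instruction mixing a UD and an F source would be
    * assumed by the hardware to synchronize against the integer pipe, one
    * whose non-control sources are all floating-point and narrower than 8B
    * would synchronize against the float pipe, and any 8B-wide source
    * selects the long pipe.
    */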

   /**
    * Return the RegDist pipeline that will execute an instruction, or
    * TGL_PIPE_NONE if the instruction is out-of-order and doesn't use the
    * RegDist synchronization mechanism.
    */
   tgl_pipe
   inferred_exec_pipe(const struct intel_device_info *devinfo,
                      const fs_inst *inst)
   {
      const brw_reg_type t = get_exec_type(inst);
      const bool is_dword_multiply = !brw_reg_type_is_floating_point(t) &&
         ((inst->opcode == BRW_OPCODE_MUL &&
           MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4) ||
          (inst->opcode == BRW_OPCODE_MAD &&
           MIN2(type_sz(inst->src[1].type), type_sz(inst->src[2].type)) >= 4));

      if (is_unordered(inst))
         return TGL_PIPE_NONE;
      else if (devinfo->verx10 < 125)
         return TGL_PIPE_FLOAT;
      else if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
               type_sz(t) >= 8)
         return TGL_PIPE_INT;
      else if (inst->opcode == SHADER_OPCODE_BROADCAST &&
               !devinfo->has_64bit_float && type_sz(t) >= 8)
         return TGL_PIPE_INT;
      else if (inst->opcode == FS_OPCODE_PACK_HALF_2x16_SPLIT)
         return TGL_PIPE_FLOAT;
      else if (type_sz(inst->dst.type) >= 8 || type_sz(t) >= 8 ||
               is_dword_multiply)
         return TGL_PIPE_LONG;
      else if (brw_reg_type_is_floating_point(inst->dst.type))
         return TGL_PIPE_FLOAT;
      else
         return TGL_PIPE_INT;
   }

   /**
    * Index of the \p p pipeline counter in the ordered_address vector defined
    * below.
    */
#define IDX(p) (p >= TGL_PIPE_FLOAT ? unsigned(p - TGL_PIPE_FLOAT) : \
                (abort(), ~0u))

   /**
    * Number of in-order hardware instructions for pipeline index \p contained
    * in this IR instruction. This determines the increment applied to the
    * RegDist counter calculated for any ordered dependency that crosses this
    * instruction.
    */
   unsigned
   ordered_unit(const struct intel_device_info *devinfo, const fs_inst *inst,
                unsigned p)
   {
      switch (inst->opcode) {
      case BRW_OPCODE_SYNC:
      case BRW_OPCODE_DO:
      case SHADER_OPCODE_UNDEF:
      case SHADER_OPCODE_HALT_TARGET:
      case FS_OPCODE_SCHEDULING_FENCE:
         return 0;
      default:
         /* Note that the following is inaccurate for virtual instructions
          * that expand to more in-order instructions than assumed here, but
          * that can only lead to suboptimal execution ordering; data
          * coherency won't be impacted. Providing exact RegDist counts for
          * each virtual instruction would allow better ALU performance, but
          * it would require keeping this switch statement in perfect sync
          * with the generator in order to avoid data corruption. Lesson is
          * (again) don't use virtual instructions if you want optimal
          * scheduling.
          */
         if (!is_unordered(inst) &&
             (p == IDX(inferred_exec_pipe(devinfo, inst)) ||
              p == IDX(TGL_PIPE_ALL)))
            return 1;
         else
            return 0;
      }
   }
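
   /* Note that ordered_unit() is also queried with IDX(TGL_PIPE_ALL) further
    * below (e.g. in update_inst_scoreboard()) purely as a yes/no test of
    * whether the instruction participates in RegDist accounting at all,
    * since the TGL_PIPE_ALL index matches any in-order instruction
    * regardless of its execution pipe.
    */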

   /**
    * Type for an instruction counter that increments for in-order
    * instructions only, arbitrarily denoted 'jp' throughout this lowering
    * pass in order to distinguish it from the regular instruction counter.
    * This is represented as a vector with an independent counter for each
    * asynchronous ALU pipeline in the EU.
    */
   struct ordered_address {
      /**
       * Construct the ordered address of a dependency known to execute on a
       * single specified pipeline \p p (unless TGL_PIPE_NONE or TGL_PIPE_ALL
       * is provided), in which case the vector counter will be initialized
       * with all components equal to INT_MIN (always satisfied) except for
       * component IDX(p).
       */
      ordered_address(tgl_pipe p = TGL_PIPE_NONE, int jp0 = INT_MIN) {
         for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++)
            jp[q] = (p == TGL_PIPE_NONE ||
                     (IDX(p) != q && p != TGL_PIPE_ALL) ?
                     INT_MIN : jp0);
      }

      int jp[IDX(TGL_PIPE_ALL)];

      friend bool
      operator==(const ordered_address &jp0, const ordered_address &jp1)
      {
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) {
            if (jp0.jp[p] != jp1.jp[p])
               return false;
         }

         return true;
      }
   };

   /**
    * Return true if the specified ordered address is trivially satisfied for
    * all pipelines except potentially for the specified pipeline \p p.
    */
   bool
   is_single_pipe(const ordered_address &jp, tgl_pipe p)
   {
      for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) {
         if ((p == TGL_PIPE_NONE || IDX(p) != q) && jp.jp[q] > INT_MIN)
            return false;
      }

      return true;
   }

   /**
    * Return the number of instructions in the program.
    */
   unsigned
   num_instructions(const backend_shader *shader)
   {
      return shader->cfg->blocks[shader->cfg->num_blocks - 1]->end_ip + 1;
   }

   /**
    * Calculate the local ordered_address instruction counter at every
    * instruction of the shader for subsequent constant-time look-up.
    */
   ordered_address *
   ordered_inst_addresses(const fs_visitor *shader)
   {
      ordered_address *jps = new ordered_address[num_instructions(shader)];
      ordered_address jp(TGL_PIPE_ALL, 0);
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         jps[ip] = jp;
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
            jp.jp[p] += ordered_unit(shader->devinfo, inst, p);
         ip++;
      }

      return jps;
   }
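
   /* Worked example (assuming the Gfx12.5+ pipe assignment of
    * inferred_exec_pipe() and the FLOAT/INT/LONG component order implied by
    * IDX()): a float ADD followed by an integer MOV and then a SEND would
    * get jps[] entries of {0, 0, 0}, {1, 0, 0} and {1, 1, 0} respectively,
    * and the SEND itself wouldn't advance any counter since it's unordered.
    */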

   /**
    * Synchronization mode required for data manipulated by in-order
    * instructions.
    *
    * Similar to tgl_sbid_mode, but without SET mode. Defined as a separate
    * enum for additional type safety. The hardware doesn't provide control
    * over the synchronization mode for RegDist annotations; this is only
    * used internally in this pass in order to optimize out redundant read
    * dependencies where possible.
    */
   enum tgl_regdist_mode {
      TGL_REGDIST_NULL = 0,
      TGL_REGDIST_SRC = 1,
      TGL_REGDIST_DST = 2
   };

   /**
    * Allow bitwise arithmetic of tgl_regdist_mode enums.
    */
   tgl_regdist_mode
   operator|(tgl_regdist_mode x, tgl_regdist_mode y)
   {
      return tgl_regdist_mode(unsigned(x) | unsigned(y));
   }

   tgl_regdist_mode
   operator&(tgl_regdist_mode x, tgl_regdist_mode y)
   {
      return tgl_regdist_mode(unsigned(x) & unsigned(y));
   }

   tgl_regdist_mode &
   operator|=(tgl_regdist_mode &x, tgl_regdist_mode y)
   {
      return x = x | y;
   }

   tgl_regdist_mode &
   operator&=(tgl_regdist_mode &x, tgl_regdist_mode y)
   {
      return x = x & y;
   }

   /** @} */

   /**
    * Representation of an equivalence relation among the set of unsigned
    * integers.
    *
    * Its initial state is the identity relation '~' such that i ~ j if and
    * only if i == j for every pair of unsigned integers i and j.
    */
   struct equivalence_relation {
      equivalence_relation(unsigned n) : is(new unsigned[n]), n(n)
      {
         for (unsigned i = 0; i < n; i++)
            is[i] = i;
      }

      ~equivalence_relation()
      {
         delete[] is;
      }

      /**
       * Return equivalence class index of the specified element. Effectively
       * this is the numeric value of an arbitrary representative from the
       * equivalence class.
       *
       * Allows the evaluation of the equivalence relation according to the
       * rule that i ~ j if and only if lookup(i) == lookup(j).
       */
      unsigned
      lookup(unsigned i) const
      {
         if (i < n && is[i] != i)
            return lookup(is[i]);
         else
            return i;
      }

      /**
       * Create an array with the results of the lookup() method for
       * constant-time evaluation.
       */
      unsigned *
      flatten() const
      {
         unsigned *ids = new unsigned[n];

         for (unsigned i = 0; i < n; i++)
            ids[i] = lookup(i);

         return ids;
      }

      /**
       * Mutate the existing equivalence relation minimally by imposing the
       * additional requirement that i ~ j.
       *
       * The algorithm updates the internal representation recursively in
       * order to guarantee transitivity while preserving the previously
       * specified equivalence requirements.
       */
      unsigned
      link(unsigned i, unsigned j)
      {
         const unsigned k = lookup(i);
         assign(i, k);
         assign(j, k);
         return k;
      }

   private:
      equivalence_relation(const equivalence_relation &);
      equivalence_relation &
      operator=(const equivalence_relation &);

      /**
       * Assign the representative of \p from to be equivalent to \p to.
       *
       * At the same time the data structure is partially flattened as much
       * as possible without increasing the number of recursive calls.
       */
      void
      assign(unsigned from, unsigned to)
      {
         if (from != to) {
            assert(from < n);

            if (is[from] != from)
               assign(is[from], to);

            is[from] = to;
         }
      }

      unsigned *is;
      unsigned n;
   };
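
   /* In effect the structure above is a small union-find forest without
    * union-by-rank: lookup() chases parent links up to a representative,
    * assign() flattens the chain it walks, and flatten() snapshots the
    * representatives once linking is done so later queries are constant
    * time.
    */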

   /**
    * Representation of a data dependency between two instructions in the
    * program.
    * @{
    */
   struct dependency {
      /**
       * No dependency information.
       */
      dependency() : ordered(TGL_REGDIST_NULL), jp(),
                     unordered(TGL_SBID_NULL), id(0),
                     exec_all(false) {}

      /**
       * Construct a dependency on the in-order instruction with the provided
       * ordered_address instruction counter.
       */
      dependency(tgl_regdist_mode mode, const ordered_address &jp,
                 bool exec_all) :
         ordered(mode), jp(jp), unordered(TGL_SBID_NULL), id(0),
         exec_all(exec_all) {}

      /**
       * Construct a dependency on the out-of-order instruction with the
       * specified synchronization token.
       */
      dependency(tgl_sbid_mode mode, unsigned id, bool exec_all) :
         ordered(TGL_REGDIST_NULL), jp(), unordered(mode), id(id),
         exec_all(exec_all) {}

      /**
       * Synchronization mode of in-order dependency, or zero if no in-order
       * dependency is present.
       */
      tgl_regdist_mode ordered;

      /**
       * Instruction counter of in-order dependency.
       *
       * For a dependency part of a different block in the program, this is
       * relative to the specific control flow path taken between the
       * dependency and the current block: It is the ordered_address such
       * that the difference between it and the ordered_address of the first
       * instruction of the current block is exactly the number of in-order
       * instructions across that control flow path. It is not guaranteed to
       * be equal to the local ordered_address of the generating instruction
       * [as returned by ordered_inst_addresses()], except for block-local
       * dependencies.
       */
      ordered_address jp;

      /**
       * Synchronization mode of unordered dependency, or zero if no
       * unordered dependency is present.
       */
      tgl_sbid_mode unordered;

      /** Synchronization token of out-of-order dependency. */
      unsigned id;

      /**
       * Whether the dependency could be run with execution masking disabled,
       * which might lead to the unwanted execution of the generating
       * instruction in cases where a BB is executed with all channels
       * disabled due to hardware bug Wa_1407528679.
       */
      bool exec_all;

      /**
       * Trivial in-order dependency that's always satisfied.
       *
       * Note that unlike a default-constructed dependency() which is also
       * trivially satisfied, this is considered to provide dependency
       * information and can be used to clear a previously pending dependency
       * via shadow().
       */
      static const dependency done;

      friend bool
      operator==(const dependency &dep0, const dependency &dep1)
      {
         return dep0.ordered == dep1.ordered &&
                dep0.jp == dep1.jp &&
                dep0.unordered == dep1.unordered &&
                dep0.id == dep1.id &&
                dep0.exec_all == dep1.exec_all;
      }

      friend bool
      operator!=(const dependency &dep0, const dependency &dep1)
      {
         return !(dep0 == dep1);
      }
   };

   const dependency dependency::done =
      dependency(TGL_REGDIST_SRC, ordered_address(), false);

   /**
    * Return whether \p dep contains any dependency information.
    */
   bool
   is_valid(const dependency &dep)
   {
      return dep.ordered || dep.unordered;
   }

   /**
    * Combine \p dep0 and \p dep1 into a single dependency object that is
    * only satisfied when both original dependencies are satisfied. This
    * might involve updating the equivalence relation \p eq in order to make
    * sure that both out-of-order dependencies are assigned the same hardware
    * SBID as synchronization token.
    */
   dependency
   merge(equivalence_relation &eq,
         const dependency &dep0, const dependency &dep1)
   {
      dependency dep;

      if (dep0.ordered || dep1.ordered) {
         dep.ordered = dep0.ordered | dep1.ordered;
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
            dep.jp.jp[p] = MAX2(dep0.jp.jp[p], dep1.jp.jp[p]);
      }

      if (dep0.unordered || dep1.unordered) {
         dep.unordered = dep0.unordered | dep1.unordered;
         dep.id = eq.link(dep0.unordered ? dep0.id : dep1.id,
                          dep1.unordered ? dep1.id : dep0.id);
      }

      dep.exec_all = dep0.exec_all || dep1.exec_all;

      return dep;
   }

   /**
    * Override dependency information of \p dep0 with that of \p dep1.
    */
   dependency
   shadow(const dependency &dep0, const dependency &dep1)
   {
      return is_valid(dep1) ? dep1 : dep0;
   }

   /**
    * Translate dependency information across the program.
    *
    * This returns a dependency on the same instruction translated to the
    * ordered_address space of a different block. The correct shift for
    * transporting a dependency across an edge of the CFG is the difference
    * between the local ordered_address of the first instruction of the
    * target block and the local ordered_address of the instruction
    * immediately after the end of the origin block.
    */
   dependency
   transport(dependency dep, int delta[IDX(TGL_PIPE_ALL)])
   {
      if (dep.ordered) {
         for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++) {
            if (dep.jp.jp[p] > INT_MIN)
               dep.jp.jp[p] += delta[p];
         }
      }

      return dep;
   }

   /**
    * Return simplified dependency removing any synchronization modes not
    * applicable to an instruction reading the same register location.
    */
   dependency
   dependency_for_read(dependency dep)
   {
      dep.ordered &= TGL_REGDIST_DST;
      return dep;
   }

   /**
    * Return simplified dependency removing any synchronization modes not
    * applicable to an instruction \p inst writing the same register
    * location.
    *
    * This clears any WaR dependency for writes performed from the same
    * pipeline as the read, since there is no possibility for a data hazard.
    */
   dependency
   dependency_for_write(const struct intel_device_info *devinfo,
                        const fs_inst *inst, dependency dep)
   {
      if (!is_unordered(inst) &&
          is_single_pipe(dep.jp, inferred_exec_pipe(devinfo, inst)))
         dep.ordered &= TGL_REGDIST_DST;
      return dep;
   }

   /** @} */
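
   /* Roughly speaking, shadow() composes a block's locally generated
    * dependencies over the incoming state (newer information about a
    * register simply replaces older information), while merge() models a
    * control-flow join where both dependencies remain pending and must
    * eventually be waited on. See propagate_block_scoreboards() below for
    * where each one is applied.
    */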

   /**
    * Scoreboard representation. This keeps track of the data dependencies
    * of registers with GRF granularity.
    */
   class scoreboard {
   public:
      /**
       * Look up the most current data dependency for register \p r.
       */
      dependency
      get(const fs_reg &r) const
      {
         if (const dependency *p = const_cast<scoreboard *>(this)->dep(r))
            return *p;
         else
            return dependency();
      }

      /**
       * Specify the most current data dependency for register \p r.
       */
      void
      set(const fs_reg &r, const dependency &d)
      {
         if (dependency *p = dep(r))
            *p = d;
      }

      /**
       * Component-wise merge() of corresponding dependencies from two
       * scoreboard objects. \sa merge().
       */
      friend scoreboard
      merge(equivalence_relation &eq,
            const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = merge(eq, sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = merge(eq, sb0.addr_dep, sb1.addr_dep);
         sb.accum_dep = merge(eq, sb0.accum_dep, sb1.accum_dep);

         return sb;
      }

      /**
       * Component-wise shadow() of corresponding dependencies from two
       * scoreboard objects. \sa shadow().
       */
      friend scoreboard
      shadow(const scoreboard &sb0, const scoreboard &sb1)
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = shadow(sb0.grf_deps[i], sb1.grf_deps[i]);

         sb.addr_dep = shadow(sb0.addr_dep, sb1.addr_dep);
         sb.accum_dep = shadow(sb0.accum_dep, sb1.accum_dep);

         return sb;
      }

      /**
       * Component-wise transport() of dependencies from a scoreboard
       * object. \sa transport().
       */
      friend scoreboard
      transport(const scoreboard &sb0, int delta[IDX(TGL_PIPE_ALL)])
      {
         scoreboard sb;

         for (unsigned i = 0; i < ARRAY_SIZE(sb.grf_deps); i++)
            sb.grf_deps[i] = transport(sb0.grf_deps[i], delta);

         sb.addr_dep = transport(sb0.addr_dep, delta);
         sb.accum_dep = transport(sb0.accum_dep, delta);

         return sb;
      }

      friend bool
      operator==(const scoreboard &sb0, const scoreboard &sb1)
      {
         for (unsigned i = 0; i < ARRAY_SIZE(sb0.grf_deps); i++) {
            if (sb0.grf_deps[i] != sb1.grf_deps[i])
               return false;
         }

         if (sb0.addr_dep != sb1.addr_dep)
            return false;

         if (sb0.accum_dep != sb1.accum_dep)
            return false;

         return true;
      }

      friend bool
      operator!=(const scoreboard &sb0, const scoreboard &sb1)
      {
         return !(sb0 == sb1);
      }

   private:
      dependency grf_deps[BRW_MAX_GRF];
      dependency addr_dep;
      dependency accum_dep;

      dependency *
      dep(const fs_reg &r)
      {
         const unsigned reg = (r.file == VGRF ? r.nr + r.offset / REG_SIZE :
                               reg_offset(r) / REG_SIZE);

         return (r.file == VGRF || r.file == FIXED_GRF ? &grf_deps[reg] :
                 r.file == MRF ? &grf_deps[GFX7_MRF_HACK_START + reg] :
                 r.file == ARF && reg >= BRW_ARF_ADDRESS &&
                                  reg < BRW_ARF_ACCUMULATOR ? &addr_dep :
                 r.file == ARF && reg >= BRW_ARF_ACCUMULATOR &&
                                  reg < BRW_ARF_FLAG ? &accum_dep :
                 NULL);
      }
   };
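
   /* Note that the tracking granularity above is a whole register (REG_SIZE
    * bytes): dependencies on sub-register regions are over-approximated to
    * the containing GRF, which is always safe, if potentially conservative.
    */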

   /**
    * Dependency list handling.
    * @{
    */
   struct dependency_list {
      dependency_list() : deps(NULL), n(0) {}

      ~dependency_list()
      {
         free(deps);
      }

      void
      push_back(const dependency &dep)
      {
         deps = (dependency *)realloc(deps, (n + 1) * sizeof(*deps));
         deps[n++] = dep;
      }

      unsigned
      size() const
      {
         return n;
      }

      const dependency &
      operator[](unsigned i) const
      {
         assert(i < n);
         return deps[i];
      }

      dependency &
      operator[](unsigned i)
      {
         assert(i < n);
         return deps[i];
      }

   private:
      dependency_list(const dependency_list &);
      dependency_list &
      operator=(const dependency_list &);

      dependency *deps;
      unsigned n;
   };

   /**
    * Add dependency \p dep to the list of dependencies of an instruction
    * \p deps.
    */
   void
   add_dependency(const unsigned *ids, dependency_list &deps, dependency dep)
   {
      if (is_valid(dep)) {
         /* Translate the unordered dependency token first in order to keep
          * the list minimally redundant.
          */
         if (dep.unordered)
            dep.id = ids[dep.id];

         /* Try to combine the specified dependency with any existing ones.
          */
         for (unsigned i = 0; i < deps.size(); i++) {
            /* Don't combine otherwise matching dependencies if there is an
             * exec_all mismatch which would cause a SET dependency to gain
             * an exec_all flag, since that would prevent it from being baked
             * into the instruction we want to allocate an SBID for.
             */
            if (deps[i].exec_all != dep.exec_all &&
                (!deps[i].exec_all || (dep.unordered & TGL_SBID_SET)) &&
                (!dep.exec_all || (deps[i].unordered & TGL_SBID_SET)))
               continue;

            if (dep.ordered && deps[i].ordered) {
               for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
                  deps[i].jp.jp[p] = MAX2(deps[i].jp.jp[p], dep.jp.jp[p]);

               deps[i].ordered |= dep.ordered;
               deps[i].exec_all |= dep.exec_all;
               dep.ordered = TGL_REGDIST_NULL;
            }

            if (dep.unordered && deps[i].unordered && deps[i].id == dep.id) {
               deps[i].unordered |= dep.unordered;
               deps[i].exec_all |= dep.exec_all;
               dep.unordered = TGL_SBID_NULL;
            }
         }

         /* Add it to the end of the list if necessary. */
         if (is_valid(dep))
            deps.push_back(dep);
      }
   }

   /**
    * Construct a tgl_swsb annotation encoding any ordered dependencies from
    * the dependency list \p deps of an instruction with ordered_address \p
    * jp. If \p exec_all is false only dependencies known to be executed with
    * channel masking applied will be considered in the calculation.
    */
   tgl_swsb
   ordered_dependency_swsb(const dependency_list &deps,
                           const ordered_address &jp,
                           bool exec_all)
   {
      tgl_pipe p = TGL_PIPE_NONE;
      unsigned min_dist = ~0u;

      for (unsigned i = 0; i < deps.size(); i++) {
         if (deps[i].ordered && exec_all >= deps[i].exec_all) {
            for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) {
               const unsigned dist = jp.jp[q] - int64_t(deps[i].jp.jp[q]);
               const unsigned max_dist = (q == IDX(TGL_PIPE_LONG) ? 14 : 10);
               assert(jp.jp[q] > deps[i].jp.jp[q]);
               if (dist <= max_dist) {
                  p = (p && IDX(p) != q ? TGL_PIPE_ALL :
                       tgl_pipe(TGL_PIPE_FLOAT + q));
                  min_dist = MIN3(min_dist, dist, 7);
               }
            }
         }
      }

      return { p ? min_dist : 0, p };
   }
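
   /* As an example of the calculation above, a single pending dependency
    * three in-order float instructions back would yield { regdist = 3,
    * pipe = TGL_PIPE_FLOAT }. Dependencies further back than the maximum
    * distance (10, or 14 for the long pipe) are treated as already satisfied
    * and dropped, the encoded distance saturates at 7, and if dependencies
    * from several pipes remain TGL_PIPE_ALL is reported instead of a single
    * pipe.
    */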

   /**
    * Return whether the dependency list \p deps of an instruction with
    * ordered_address \p jp has any non-trivial ordered dependencies. If \p
    * exec_all is false only dependencies known to be executed with channel
    * masking applied will be considered in the calculation.
    */
   bool
   find_ordered_dependency(const dependency_list &deps,
                           const ordered_address &jp,
                           bool exec_all)
   {
      return ordered_dependency_swsb(deps, jp, exec_all).regdist;
   }

   /**
    * Return the full tgl_sbid_mode bitset for the first unordered dependency
    * on the list \p deps that matches the specified tgl_sbid_mode, or zero
    * if no such dependency is present. If \p exec_all is false only
    * dependencies known to be executed with channel masking applied will be
    * considered in the calculation.
    */
   tgl_sbid_mode
   find_unordered_dependency(const dependency_list &deps,
                             tgl_sbid_mode unordered,
                             bool exec_all)
   {
      if (unordered) {
         for (unsigned i = 0; i < deps.size(); i++) {
            if ((unordered & deps[i].unordered) &&
                exec_all >= deps[i].exec_all)
               return deps[i].unordered;
         }
      }

      return TGL_SBID_NULL;
   }

   /**
    * Return the tgl_sbid_mode bitset of an unordered dependency from the
    * list \p deps that can be represented directly in the SWSB annotation of
    * the instruction without additional SYNC instructions, or zero if no
    * such dependency is present.
    */
   tgl_sbid_mode
   baked_unordered_dependency_mode(const struct intel_device_info *devinfo,
                                   const fs_inst *inst,
                                   const dependency_list &deps,
                                   const ordered_address &jp)
   {
      const bool exec_all = inst->force_writemask_all;
      const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);
      const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp,
                                                            exec_all).pipe;

      if (find_unordered_dependency(deps, TGL_SBID_SET, exec_all))
         return find_unordered_dependency(deps, TGL_SBID_SET, exec_all);
      else if (has_ordered && is_unordered(inst))
         return TGL_SBID_NULL;
      else if (find_unordered_dependency(deps, TGL_SBID_DST, exec_all) &&
               (!has_ordered ||
                ordered_pipe == inferred_sync_pipe(devinfo, inst)))
         return find_unordered_dependency(deps, TGL_SBID_DST, exec_all);
      else if (!has_ordered)
         return find_unordered_dependency(deps, TGL_SBID_SRC, exec_all);
      else
         return TGL_SBID_NULL;
   }

   /**
    * Return whether an ordered dependency from the list \p deps can be
    * represented directly in the SWSB annotation of the instruction without
    * additional SYNC instructions.
    */
   bool
   baked_ordered_dependency_mode(const struct intel_device_info *devinfo,
                                 const fs_inst *inst,
                                 const dependency_list &deps,
                                 const ordered_address &jp)
   {
      const bool exec_all = inst->force_writemask_all;
      const bool has_ordered = find_ordered_dependency(deps, jp, exec_all);
      const tgl_pipe ordered_pipe = ordered_dependency_swsb(deps, jp,
                                                            exec_all).pipe;
      const tgl_sbid_mode unordered_mode =
         baked_unordered_dependency_mode(devinfo, inst, deps, jp);

      if (!has_ordered)
         return false;
      else if (!unordered_mode)
         return true;
      else
         return ordered_pipe == inferred_sync_pipe(devinfo, inst) &&
                unordered_mode == (is_unordered(inst) ? TGL_SBID_SET :
                                   TGL_SBID_DST);
   }

   /** @} */
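
   /* In other words, the two helpers above decide which of an instruction's
    * accumulated dependencies can ride along in its own SWSB field; anything
    * that doesn't fit will be emitted as a separate SYNC.NOP by
    * emit_inst_dependencies() further below.
    */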

   /**
    * Shader instruction dependency calculation.
    * @{
    */

   /**
    * Update scoreboard object \p sb to account for the execution of
    * instruction \p inst.
    */
   void
   update_inst_scoreboard(const fs_visitor *shader, const ordered_address *jps,
                          const fs_inst *inst, unsigned ip, scoreboard &sb)
   {
      const bool exec_all = inst->force_writemask_all;
      const struct intel_device_info *devinfo = shader->devinfo;
      const tgl_pipe p = inferred_exec_pipe(devinfo, inst);
      const ordered_address jp = p ? ordered_address(p, jps[ip].jp[IDX(p)]) :
                                     ordered_address();

      /* Track any source registers that may be fetched asynchronously by
       * this instruction, otherwise clear the dependency in order to avoid
       * subsequent redundant synchronization.
       */
      for (unsigned i = 0; i < inst->sources; i++) {
         const dependency rd_dep =
            (inst->is_payload(i) ||
             inst->is_math()) ? dependency(TGL_SBID_SRC, ip, exec_all) :
            ordered_unit(devinfo, inst, IDX(TGL_PIPE_ALL)) ?
               dependency(TGL_REGDIST_SRC, jp, exec_all) :
            dependency::done;

         for (unsigned j = 0; j < regs_read(inst, i); j++)
            sb.set(byte_offset(inst->src[i], REG_SIZE * j), rd_dep);
      }

      if (inst->reads_accumulator_implicitly())
         sb.set(brw_acc_reg(8), dependency(TGL_REGDIST_SRC, jp, exec_all));

      if (is_send(inst) && inst->base_mrf != -1) {
         const dependency rd_dep = dependency(TGL_SBID_SRC, ip, exec_all);

         for (unsigned j = 0; j < inst->mlen; j++)
            sb.set(brw_uvec_mrf(8, inst->base_mrf + j, 0), rd_dep);
      }

      /* Track any destination registers of this instruction. */
      const dependency wr_dep =
         is_unordered(inst) ? dependency(TGL_SBID_DST, ip, exec_all) :
         ordered_unit(devinfo, inst, IDX(TGL_PIPE_ALL)) ?
            dependency(TGL_REGDIST_DST, jp, exec_all) :
         dependency();

      if (inst->writes_accumulator_implicitly(devinfo))
         sb.set(brw_acc_reg(8), wr_dep);

      if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
          !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            sb.set(byte_offset(inst->dst, REG_SIZE * j), wr_dep);
      }
   }

   /**
    * Calculate scoreboard objects locally that represent any pending (and
    * unconditionally resolved) dependencies at the end of each block of the
    * program.
    */
   scoreboard *
   gather_block_scoreboards(const fs_visitor *shader,
                            const ordered_address *jps)
   {
      scoreboard *sbs = new scoreboard[shader->cfg->num_blocks];
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg)
         update_inst_scoreboard(shader, jps, inst, ip++, sbs[block->num]);

      return sbs;
   }

   /**
    * Propagate data dependencies globally through the control flow graph
    * until a fixed point is reached.
    *
    * Calculates the set of dependencies potentially pending at the beginning
    * of each block, and returns it as an array of scoreboard objects.
    */
   scoreboard *
   propagate_block_scoreboards(const fs_visitor *shader,
                               const ordered_address *jps,
                               equivalence_relation &eq)
   {
      const scoreboard *delta_sbs = gather_block_scoreboards(shader, jps);
      scoreboard *in_sbs = new scoreboard[shader->cfg->num_blocks];
      scoreboard *out_sbs = new scoreboard[shader->cfg->num_blocks];

      for (bool progress = true; progress;) {
         progress = false;

         foreach_block(block, shader->cfg) {
            const scoreboard sb = shadow(in_sbs[block->num],
                                         delta_sbs[block->num]);

            if (sb != out_sbs[block->num]) {
               foreach_list_typed(bblock_link, child_link, link,
                                  &block->children) {
                  scoreboard &in_sb = in_sbs[child_link->block->num];
                  int delta[IDX(TGL_PIPE_ALL)];

                  for (unsigned p = 0; p < IDX(TGL_PIPE_ALL); p++)
                     delta[p] = jps[child_link->block->start_ip].jp[p]
                        - jps[block->end_ip].jp[p]
                        - ordered_unit(shader->devinfo,
                                       static_cast<fs_inst *>(block->end()),
                                       p);

                  in_sb = merge(eq, in_sb, transport(sb, delta));
               }

               out_sbs[block->num] = sb;
               progress = true;
            }
         }
      }

      delete[] delta_sbs;
      delete[] out_sbs;

      return in_sbs;
   }
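
   /* The loop above is a standard forward dataflow fixed-point iteration:
    * each block's outgoing state is recomputed from its incoming state and
    * its local deltas, and its successors are re-merged until nothing
    * changes. The delta[] correction keeps the per-pipe counters consistent
    * when a dependency is carried across a CFG edge, as described in the
    * transport() documentation above.
    */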

   /**
    * Return the list of potential dependencies of each instruction in the
    * shader based on the result of global dependency analysis.
    */
   dependency_list *
   gather_inst_dependencies(const fs_visitor *shader,
                            const ordered_address *jps)
   {
      const struct intel_device_info *devinfo = shader->devinfo;
      equivalence_relation eq(num_instructions(shader));
      scoreboard *sbs = propagate_block_scoreboards(shader, jps, eq);
      const unsigned *ids = eq.flatten();
      dependency_list *deps = new dependency_list[num_instructions(shader)];
      unsigned ip = 0;

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         const bool exec_all = inst->force_writemask_all;
         const tgl_pipe p = inferred_exec_pipe(devinfo, inst);
         scoreboard &sb = sbs[block->num];

         for (unsigned i = 0; i < inst->sources; i++) {
            for (unsigned j = 0; j < regs_read(inst, i); j++)
               add_dependency(ids, deps[ip], dependency_for_read(
                  sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
         }

         if (inst->reads_accumulator_implicitly()) {
            /* Wa_22012725308:
             *
             * "When the accumulator registers are used as source and/or
             *  destination, hardware does not ensure prevention of write
             *  after read hazard across execution pipes."
             */
            const dependency dep = sb.get(brw_acc_reg(8));
            if (dep.ordered && !is_single_pipe(dep.jp, p))
               add_dependency(ids, deps[ip], dep);
         }

         if (is_send(inst) && inst->base_mrf != -1) {
            for (unsigned j = 0; j < inst->mlen; j++)
               add_dependency(ids, deps[ip], dependency_for_read(
                  sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
         }

         if (is_unordered(inst))
            add_dependency(ids, deps[ip],
                           dependency(TGL_SBID_SET, ip, exec_all));

         if (!inst->no_dd_check) {
            if (inst->dst.file != BAD_FILE && !inst->dst.is_null() &&
                !inst->dst.is_accumulator()) {
               for (unsigned j = 0; j < regs_written(inst); j++) {
                  add_dependency(ids, deps[ip],
                     dependency_for_write(devinfo, inst,
                        sb.get(byte_offset(inst->dst, REG_SIZE * j))));
               }
            }

            if (inst->writes_accumulator_implicitly(devinfo) ||
                inst->dst.is_accumulator()) {
               /* Wa_22012725308:
                *
                * "When the accumulator registers are used as source and/or
                *  destination, hardware does not ensure prevention of write
                *  after read hazard across execution pipes."
                */
               const dependency dep = sb.get(brw_acc_reg(8));
               if (dep.ordered && !is_single_pipe(dep.jp, p))
                  add_dependency(ids, deps[ip], dep);
            }

            if (is_send(inst) && inst->base_mrf != -1) {
               for (unsigned j = 0; j < inst->implied_mrf_writes(); j++)
                  add_dependency(ids, deps[ip],
                     dependency_for_write(devinfo, inst,
                        sb.get(brw_uvec_mrf(8, inst->base_mrf + j, 0))));
            }
         }

         update_inst_scoreboard(shader, jps, inst, ip, sb);
         ip++;
      }

      delete[] sbs;
      delete[] ids;

      return deps;
   }

   /** @} */

   /**
    * Allocate SBID tokens to track the execution of every out-of-order
    * instruction of the shader.
    */
   dependency_list *
   allocate_inst_dependencies(const fs_visitor *shader,
                              const dependency_list *deps0)
   {
      /* XXX - Use bin-packing algorithm to assign hardware SBIDs optimally
       *       in shaders with a large number of SEND messages.
       */

      /* Allocate an unordered dependency ID to hardware SBID translation
       * table with as many entries as there are instructions in the shader,
       * which is the maximum number of unordered IDs we can find in the
       * program.
       */
      unsigned *ids = new unsigned[num_instructions(shader)];
      for (unsigned ip = 0; ip < num_instructions(shader); ip++)
         ids[ip] = ~0u;

      dependency_list *deps1 = new dependency_list[num_instructions(shader)];
      unsigned next_id = 0;

      for (unsigned ip = 0; ip < num_instructions(shader); ip++) {
         for (unsigned i = 0; i < deps0[ip].size(); i++) {
            const dependency &dep = deps0[ip][i];

            if (dep.unordered && ids[dep.id] == ~0u)
               ids[dep.id] = (next_id++) & 0xf;

            add_dependency(ids, deps1[ip], dep);
         }
      }

      delete[] ids;

      return deps1;
   }
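
   /* Note the "(next_id++) & 0xf" above: the mask reflects the 16 SBID
    * tokens provided by the hardware, so IDs are handed out round-robin and
    * simply wrap around. As the XXX note says, this is a straightforward
    * assignment scheme rather than an optimal one for shaders with very many
    * out-of-order instructions in flight.
    */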

   /**
    * Emit dependency information provided by \p deps into the shader,
    * inserting additional SYNC instructions for dependencies that can't be
    * represented directly by annotating existing instructions.
    */
   void
   emit_inst_dependencies(fs_visitor *shader,
                          const ordered_address *jps,
                          const dependency_list *deps)
   {
      const struct intel_device_info *devinfo = shader->devinfo;
      unsigned ip = 0;

      foreach_block_and_inst_safe(block, fs_inst, inst, shader->cfg) {
         const bool exec_all = inst->force_writemask_all;
         const bool ordered_mode =
            baked_ordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]);
         const tgl_sbid_mode unordered_mode =
            baked_unordered_dependency_mode(devinfo, inst, deps[ip], jps[ip]);
         tgl_swsb swsb = !ordered_mode ? tgl_swsb() :
            ordered_dependency_swsb(deps[ip], jps[ip], exec_all);

         for (unsigned i = 0; i < deps[ip].size(); i++) {
            const dependency &dep = deps[ip][i];

            if (dep.unordered) {
               if (unordered_mode == dep.unordered &&
                   exec_all >= dep.exec_all && !swsb.mode) {
                  /* Bake unordered dependency into the instruction's SWSB if
                   * possible, except in cases where the current instruction
                   * isn't marked NoMask but the dependency is, since that
                   * might lead to data coherency issues due to
                   * Wa_1407528679.
                   */
                  swsb.sbid = dep.id;
                  swsb.mode = dep.unordered;
               } else {
                  /* Emit dependency into the SWSB of an extra SYNC
                   * instruction.
                   */
                  const fs_builder ibld = fs_builder(shader, block, inst)
                                          .exec_all().group(1, 0);
                  fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC,
                                            ibld.null_reg_ud(),
                                            brw_imm_ud(TGL_SYNC_NOP));
                  sync->sched.sbid = dep.id;
                  sync->sched.mode = dep.unordered;
                  assert(!(sync->sched.mode & TGL_SBID_SET));
               }
            }
         }

         for (unsigned i = 0; i < deps[ip].size(); i++) {
            const dependency &dep = deps[ip][i];

            if (dep.ordered &&
                find_ordered_dependency(deps[ip], jps[ip], true) &&
                (!ordered_mode || dep.exec_all > exec_all)) {
               /* If the current instruction is not marked NoMask but an
                * ordered dependency is, perform the synchronization as a
                * separate NoMask SYNC instruction in order to avoid data
                * coherency issues due to Wa_1407528679. The similar
                * scenario with unordered dependencies should have been
                * handled above.
                */
               const fs_builder ibld = fs_builder(shader, block, inst)
                                       .exec_all().group(1, 0);
               fs_inst *sync = ibld.emit(BRW_OPCODE_SYNC,
                                         ibld.null_reg_ud(),
                                         brw_imm_ud(TGL_SYNC_NOP));
               sync->sched = ordered_dependency_swsb(deps[ip], jps[ip], true);
               break;
            }
         }

         /* Update the IR. */
         inst->sched = swsb;
         inst->no_dd_check = inst->no_dd_clear = false;
         ip++;
      }
   }
}

bool
fs_visitor::lower_scoreboard()
{
   if (devinfo->ver >= 12) {
      const ordered_address *jps = ordered_inst_addresses(this);
      const dependency_list *deps0 = gather_inst_dependencies(this, jps);
      const dependency_list *deps1 = allocate_inst_dependencies(this, deps0);
      emit_inst_dependencies(this, jps, deps1);
      delete[] deps1;
      delete[] deps0;
      delete[] jps;
   }

   return true;
}