/*
 * Copyright (C) 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3_compiler.h"
#include "ir3_ra.h"
#include "ralloc.h"

/* This pass "merges" compatible phi-web SSA values. First, we insert a bunch
 * of parallelcopy's to trivially turn the program into CSSA form. Then we
 * try to "merge" SSA defs into "merge sets" which could be allocated to a
 * single register, in order to eliminate copies. First we merge phi nodes,
 * which should always succeed because of the parallelcopy's we inserted, and
 * then we try to coalesce the copies we introduced.
 *
 * The merged registers are used for three purposes:
 *
 * 1. We always use the same pvtmem slot for spilling all SSA defs in each
 *    merge set. This prevents us from having to insert memory-to-memory
 *    copies in the spiller and makes sure we don't insert unnecessary copies.
 * 2. When two values are live at the same time, part of the same merge set,
 *    and they overlap each other in the merge set, they always occupy
 *    overlapping physical registers in RA. This reduces register pressure
 *    and copies in several important scenarios:
 *    - When sources of a collect are used later by something else, we don't
 *      have to introduce copies.
 *    - We can handle sequences of extracts that "explode" a vector into its
 *      components without any additional copying.
 * 3. We use the merge sets for affinities in register allocation: that is,
 *    we try to allocate all the definitions in the same merge set to the
 *    same/compatible registers. This helps us e.g. allocate sources of a
 *    collect to contiguous registers without too much special code in RA.
 *
 * In a "normal" register allocator, or when spilling, we'd just merge
 * registers in the same merge set to the same register, but with SSA-based
 * register allocation we may have to split the live interval.
 *
 * The implementation is based on "Revisiting Out-of-SSA Translation for
 * Correctness, Code Quality, and Efficiency," and is broadly similar to the
 * implementation in nir_from_ssa, with the twist that we also try to coalesce
 * META_SPLIT and META_COLLECT. This makes this pass more complicated but
 * prevents us from needing to handle these specially in RA and the spiller,
 * which are already complicated enough. This also forces us to implement the
 * value-comparison optimization described in the paper, as without it we
 * wouldn't be able to coalesce META_SPLIT even in the simplest of cases.
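 *
 * As a rough illustration (hypothetical IR, not taken from any particular
 * shader), given a collect such as
 *
 *    ssa_3 = collect ssa_0, ssa_1, ssa_2
 *
 * we'd like ssa_0..ssa_2 to end up in the same merge set as ssa_3 at
 * consecutive offsets, so that RA can place them in adjacent registers and
 * neither the collect nor any split that later extracts a component needs
 * an actual copy.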
 */

/* In order to dynamically reconstruct the dominance forest, we need the
 * instructions ordered by a preorder traversal of the dominance tree:
 */
static unsigned
index_instrs(struct ir3_block *block, unsigned index)
{
   foreach_instr (instr, &block->instr_list)
      instr->ip = index++;

   for (unsigned i = 0; i < block->dom_children_count; i++)
      index = index_instrs(block->dom_children[i], index);

   return index;
}

/* Definitions within a merge set are ordered by instr->ip as set above: */

static bool
def_after(struct ir3_register *a, struct ir3_register *b)
{
   return a->instr->ip > b->instr->ip;
}

static bool
def_dominates(struct ir3_register *a, struct ir3_register *b)
{
   if (def_after(a, b)) {
      return false;
   } else if (a->instr->block == b->instr->block) {
      return def_after(b, a);
   } else {
      return ir3_block_dominates(a->instr->block, b->instr->block);
   }
}

/* This represents a region inside a register. The offset is relative to the
 * start of the register, and offset + size <= size(reg).
 */
struct def_value {
   struct ir3_register *reg;
   unsigned offset, size;
};

/* Chase any copies to get the source of a region inside a register. This is
 * Value(a) in the paper.
 */
static struct def_value
chase_copies(struct def_value value)
{
   while (true) {
      struct ir3_instruction *instr = value.reg->instr;
      if (instr->opc == OPC_META_SPLIT) {
         value.offset += instr->split.off * reg_elem_size(value.reg);
         value.reg = instr->srcs[0]->def;
      } else if (instr->opc == OPC_META_COLLECT) {
         if (value.offset % reg_elem_size(value.reg) != 0 ||
             value.size > reg_elem_size(value.reg) ||
             value.offset + value.size > reg_size(value.reg))
            break;
         struct ir3_register *src =
            instr->srcs[value.offset / reg_elem_size(value.reg)];
         if (!src->def)
            break;
         value.offset = 0;
         value.reg = src->def;
      } else {
         /* TODO: parallelcopy */
         break;
      }
   }

   return value;
}

/* This represents an entry in the merge set, and consists of a register +
 * offset from the merge set base.
 */
struct merge_def {
   struct ir3_register *reg;
   unsigned offset;
};

static bool
can_skip_interference(const struct merge_def *a, const struct merge_def *b)
{
   unsigned a_start = a->offset;
   unsigned b_start = b->offset;
   unsigned a_end = a_start + reg_size(a->reg);
   unsigned b_end = b_start + reg_size(b->reg);

   /* Registers that don't overlap never interfere */
   if (a_end <= b_start || b_end <= a_start)
      return true;

   /* Disallow skipping interference unless one definition contains the
    * other. This restriction is important for register allocation, because
    * it means that at any given point in the program, the live values in a
    * given merge set will form a tree. If they didn't, then one live value
    * would partially overlap another, and they would have overlapping live
    * ranges because they're live at the same point. This simplifies register
    * allocation and spilling.
    */
   if (!((a_start <= b_start && a_end >= b_end) ||
         (b_start <= a_start && b_end >= a_end)))
      return false;

   /* For each register, chase the intersection of a and b to find the
    * ultimate source.
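    *
    * (Illustrative, hypothetical example: if a is the result of splitting
    * one component out of some vector ssa_5, placed at the matching offset
    * in the proposed merged set, and b is ssa_5 itself, the overlapping
    * region chases back to the same place inside ssa_5 on both sides, so
    * the two defs hold the same value there and the interference check can
    * be skipped.)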
    */
   unsigned start = MAX2(a_start, b_start);
   unsigned end = MIN2(a_end, b_end);

   struct def_value a_value = chase_copies((struct def_value){
      .reg = a->reg,
      .offset = start - a_start,
      .size = end - start,
   });
   struct def_value b_value = chase_copies((struct def_value){
      .reg = b->reg,
      .offset = start - b_start,
      .size = end - start,
   });
   return a_value.reg == b_value.reg && a_value.offset == b_value.offset;
}

static struct ir3_merge_set *
get_merge_set(struct ir3_register *def)
{
   if (def->merge_set)
      return def->merge_set;

   struct ir3_merge_set *set = ralloc(def, struct ir3_merge_set);
   set->preferred_reg = ~0;
   set->interval_start = ~0;
   set->spill_slot = ~0;
   set->size = reg_size(def);
   set->alignment = (def->flags & IR3_REG_HALF) ? 1 : 2;
   set->regs_count = 1;
   set->regs = ralloc(set, struct ir3_register *);
   set->regs[0] = def;

   return set;
}

/* Merges b into a */
static struct ir3_merge_set *
merge_merge_sets(struct ir3_merge_set *a, struct ir3_merge_set *b,
                 int b_offset)
{
   if (b_offset < 0)
      return merge_merge_sets(b, a, -b_offset);

   struct ir3_register **new_regs =
      rzalloc_array(a, struct ir3_register *, a->regs_count + b->regs_count);

   unsigned a_index = 0, b_index = 0, new_index = 0;
   for (; a_index < a->regs_count || b_index < b->regs_count; new_index++) {
      if (b_index < b->regs_count &&
          (a_index == a->regs_count ||
           def_after(a->regs[a_index], b->regs[b_index]))) {
         new_regs[new_index] = b->regs[b_index++];
         new_regs[new_index]->merge_set_offset += b_offset;
      } else {
         new_regs[new_index] = a->regs[a_index++];
      }
      new_regs[new_index]->merge_set = a;
   }

   assert(new_index == a->regs_count + b->regs_count);

   /* Technically this should be the lcm, but because alignment is only 1 or
    * 2 so far this should be ok.
    */
   a->alignment = MAX2(a->alignment, b->alignment);
   a->regs_count += b->regs_count;
   ralloc_free(a->regs);
   a->regs = new_regs;
   a->size = MAX2(a->size, b->size + b_offset);

   return a;
}

static bool
merge_sets_interfere(struct ir3_liveness *live, struct ir3_merge_set *a,
                     struct ir3_merge_set *b, int b_offset)
{
   if (b_offset < 0)
      return merge_sets_interfere(live, b, a, -b_offset);

   struct merge_def dom[a->regs_count + b->regs_count];
   unsigned a_index = 0, b_index = 0;
   int dom_index = -1;

   /* Reject trying to merge the sets if the alignment doesn't work out */
   if (b_offset % a->alignment != 0)
      return true;

   while (a_index < a->regs_count || b_index < b->regs_count) {
      struct merge_def current;
      if (a_index == a->regs_count) {
         current.reg = b->regs[b_index];
         current.offset = current.reg->merge_set_offset + b_offset;
         b_index++;
      } else if (b_index == b->regs_count) {
         current.reg = a->regs[a_index];
         current.offset = current.reg->merge_set_offset;
         a_index++;
      } else {
         if (def_after(b->regs[b_index], a->regs[a_index])) {
            current.reg = a->regs[a_index];
            current.offset = current.reg->merge_set_offset;
            a_index++;
         } else {
            current.reg = b->regs[b_index];
            current.offset = current.reg->merge_set_offset + b_offset;
            b_index++;
         }
      }

      while (dom_index >= 0 &&
             !def_dominates(dom[dom_index].reg, current.reg)) {
         dom_index--;
      }

      /* TODO: in the original paper, just dom[dom_index] needs to be
       * checked for interference. We implement the value-chasing extension
       * as well as support for sub-registers, which complicates this
       * significantly because it's no longer the case that if a dominates b
       * dominates c and a and b don't interfere then we only need to check
       * interference between b and c to be sure a and c don't interfere --
       * this means we may have to check for interference against values
       * higher in the stack than dom[dom_index].
       * In the paper there's a description of a way to do fewer
       * interference tests with the value-chasing extension, but we'd have
       * to come up with something ourselves for handling the similar
       * problems that come up with allowing values to contain
       * sub-registers. For now we just test everything in the stack.
       */
      for (int i = 0; i <= dom_index; i++) {
         if (can_skip_interference(&current, &dom[i]))
            continue;

         /* Ok, now we actually have to check interference. Since we know
          * that dom[i] dominates current, this boils down to checking
          * whether dom[i] is live after current.
          */
         if (ir3_def_live_after(live, dom[i].reg, current.reg->instr))
            return true;
      }

      dom[++dom_index] = current;
   }

   return false;
}

static void
try_merge_defs(struct ir3_liveness *live, struct ir3_register *a,
               struct ir3_register *b, unsigned b_offset)
{
   struct ir3_merge_set *a_set = get_merge_set(a);
   struct ir3_merge_set *b_set = get_merge_set(b);

   if (a_set == b_set) {
      /* Note: Even in this case we may not always successfully be able to
       * coalesce this copy, if the offsets don't line up. But in any case,
       * we can't do anything.
       */
      return;
   }

   int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;

   if (!merge_sets_interfere(live, a_set, b_set, b_set_offset))
      merge_merge_sets(a_set, b_set, b_set_offset);
}

void
ir3_force_merge(struct ir3_register *a, struct ir3_register *b, int b_offset)
{
   struct ir3_merge_set *a_set = get_merge_set(a);
   struct ir3_merge_set *b_set = get_merge_set(b);

   if (a_set == b_set)
      return;

   int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;
   merge_merge_sets(a_set, b_set, b_set_offset);
}

static void
coalesce_phi(struct ir3_liveness *live, struct ir3_instruction *phi)
{
   for (unsigned i = 0; i < phi->srcs_count; i++) {
      if (phi->srcs[i]->def)
         try_merge_defs(live, phi->dsts[0], phi->srcs[i]->def, 0);
   }
}

static void
aggressive_coalesce_parallel_copy(struct ir3_liveness *live,
                                  struct ir3_instruction *pcopy)
{
   for (unsigned i = 0; i < pcopy->dsts_count; i++) {
      if (!(pcopy->srcs[i]->flags & IR3_REG_SSA))
         continue;
      try_merge_defs(live, pcopy->dsts[i], pcopy->srcs[i]->def, 0);
   }
}

static void
aggressive_coalesce_split(struct ir3_liveness *live,
                          struct ir3_instruction *split)
{
   try_merge_defs(live, split->srcs[0]->def, split->dsts[0],
                  split->split.off * reg_elem_size(split->dsts[0]));
}

static void
aggressive_coalesce_collect(struct ir3_liveness *live,
                            struct ir3_instruction *collect)
{
   for (unsigned i = 0, offset = 0; i < collect->srcs_count;
        offset += reg_elem_size(collect->srcs[i]), i++) {
      if (!(collect->srcs[i]->flags & IR3_REG_SSA))
         continue;
      try_merge_defs(live, collect->dsts[0], collect->srcs[i]->def, offset);
   }
}

static void
create_parallel_copy(struct ir3_block *block)
{
   for (unsigned i = 0; i < 2; i++) {
      if (!block->successors[i])
         continue;
      struct ir3_block *succ = block->successors[i];
      unsigned pred_idx = ir3_block_get_pred_index(succ, block);

      unsigned phi_count = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;

         /* Avoid undef */
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;

         /* We don't support critical edges. If we were to support them,
          * we'd need to insert parallel copies after the phi node to solve
          * the lost-copy problem.
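          *
          * (A critical edge would be one leaving a block with multiple
          * successors and entering a block with multiple predecessors; the
          * assert below checks that the edge feeding this phi comes from a
          * block with only this one successor.)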
          */
         assert(i == 0 && !block->successors[1]);

         phi_count++;
      }

      if (phi_count == 0)
         continue;

      struct ir3_register *src[phi_count];
      unsigned j = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;
         src[j++] = phi->srcs[pred_idx];
      }
      assert(j == phi_count);

      struct ir3_instruction *pcopy = ir3_instr_create(
         block, OPC_META_PARALLEL_COPY, phi_count, phi_count);

      for (j = 0; j < phi_count; j++) {
         struct ir3_register *reg = __ssa_dst(pcopy);
         reg->flags |= src[j]->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
         reg->size = src[j]->size;
         reg->wrmask = src[j]->wrmask;
      }

      for (j = 0; j < phi_count; j++) {
         pcopy->srcs[pcopy->srcs_count++] =
            ir3_reg_clone(block->shader, src[j]);
      }

      j = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;
         phi->srcs[pred_idx]->def = pcopy->dsts[j];
         phi->srcs[pred_idx]->flags = pcopy->dsts[j]->flags;
         j++;
      }
      assert(j == phi_count);
   }
}

void
ir3_create_parallel_copies(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      create_parallel_copy(block);
   }
}

static void
index_merge_sets(struct ir3_liveness *live, struct ir3 *ir)
{
   unsigned offset = 0;
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->dsts_count; i++) {
            struct ir3_register *dst = instr->dsts[i];

            unsigned dst_offset;
            struct ir3_merge_set *merge_set = dst->merge_set;
            unsigned size = reg_size(dst);
            if (merge_set) {
               if (merge_set->interval_start == ~0) {
                  merge_set->interval_start = offset;
                  offset += merge_set->size;
               }
               dst_offset = merge_set->interval_start + dst->merge_set_offset;
            } else {
               dst_offset = offset;
               offset += size;
            }

            dst->interval_start = dst_offset;
            dst->interval_end = dst_offset + size;
         }
      }
   }

   live->interval_offset = offset;
}

#define RESET "\x1b[0m"
#define BLUE  "\x1b[0;34m"
#define SYN_SSA(x) BLUE x RESET

static void
dump_merge_sets(struct ir3 *ir)
{
   d("merge sets:");
   struct set *merge_sets = _mesa_pointer_set_create(NULL);
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->dsts_count; i++) {
            struct ir3_register *dst = instr->dsts[i];
            struct ir3_merge_set *merge_set = dst->merge_set;

            if (!merge_set || _mesa_set_search(merge_sets, merge_set))
               continue;

            d("merge set, size %u, align %u:", merge_set->size,
              merge_set->alignment);
            for (unsigned j = 0; j < merge_set->regs_count; j++) {
               struct ir3_register *reg = merge_set->regs[j];
               d("\t" SYN_SSA("ssa_%u") ":%u, offset %u",
                 reg->instr->serialno, reg->name, reg->merge_set_offset);
            }

            _mesa_set_add(merge_sets, merge_set);
         }
      }
   }

   ralloc_free(merge_sets);
}

void
ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir)
{
   index_instrs(ir3_start_block(ir), 0);

   /* First pass: coalesce phis, which must be together.
    */
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (instr->opc != OPC_META_PHI)
            break;

         coalesce_phi(live, instr);
      }
   }

   /* Second pass: aggressively coalesce parallelcopy, split, collect */
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         switch (instr->opc) {
         case OPC_META_SPLIT:
            aggressive_coalesce_split(live, instr);
            break;
         case OPC_META_COLLECT:
            aggressive_coalesce_collect(live, instr);
            break;
         case OPC_META_PARALLEL_COPY:
            aggressive_coalesce_parallel_copy(live, instr);
            break;
         default:
            break;
         }
      }
   }

   index_merge_sets(live, ir);

   if (ir3_shader_debug & IR3_DBG_RAMSGS)
      dump_merge_sets(ir);
}