//===-- PerfectShuffle.cpp - Perfect Shuffle Generator --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file computes an optimal sequence of instructions for doing all shuffles
// of two 4-element vectors.  With a release build and when configured to emit
// an altivec instruction table, this takes about 30s to run on a 2.7GHz
// PowerPC G5.
//
//===----------------------------------------------------------------------===//
|
|
|
|
|
2006-04-18 00:21:25 +00:00
|
|
|
#include <cassert>
|
2008-02-20 11:08:44 +00:00
|
|
|
#include <cstdlib>
|
2012-12-04 10:37:14 +00:00
|
|
|
#include <iomanip>
|
|
|
|
#include <iostream>
|
|
|
|
#include <vector>
|
2022-04-19 14:49:50 +01:00
|
|
|
|
|
|
|
#define GENERATE_NEON
|
|
|
|
#define GENERATE_NEON_INS
|
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
struct Operator;
|
|
|
|
|
|
|
|
// Masks are 4-nibble hex numbers, with element 0 in the most significant
// nibble.  A value of 0-7 in a nibble means that the element is taken from that
// position of the two input vectors: 0-3 refer to the first (LHS) input and 4-7
// to the second (RHS) input.  A value of 8 means the entry is undefined.
|
|
|
|
|
|
|
|
// Mask manipulation functions.
|
2009-08-21 12:39:38 +00:00
|
|
|
/// MakeMask - Pack four element values into a single 16-bit mask.  V0 ends up
/// in the most significant nibble and V3 in the least significant one.
static inline unsigned short MakeMask(unsigned V0, unsigned V1,
                                      unsigned V2, unsigned V3) {
  // Shift the elements in one nibble at a time, first element first.
  unsigned Packed = V0;
  Packed = (Packed << 4) | V1;
  Packed = (Packed << 4) | V2;
  Packed = (Packed << 4) | V3;
  return Packed;
}
|
|
|
|
|
|
|
|
/// getMaskElt - Return the 4-bit value stored at position Elt of Mask.
/// Position 0 is the most significant nibble of the 16-bit mask and position 3
/// the least significant one.
static unsigned getMaskElt(unsigned Mask, unsigned Elt) {
  const unsigned ShiftAmt = 4 * (3 - Elt);
  const unsigned Shifted = Mask >> ShiftAmt;
  return Shifted & 0xF;
}
|
|
|
|
|
|
|
|
/// setMaskElt - Return a copy of Mask in which the nibble at position Elt
/// (0 = most significant) has been replaced by NewVal.  The other three
/// nibbles are left untouched.
static unsigned setMaskElt(unsigned Mask, unsigned Elt, unsigned NewVal) {
  const unsigned Shift = 4 * (3 - Elt);
  // Clear the old field first, then drop the new value into place.
  const unsigned Cleared = Mask & ~(0xFu << Shift);
  return Cleared | (NewVal << Shift);
}
|
|
|
|
|
|
|
|
/// isValidMask - Return true if every element of Mask is either an element
/// index (0-7) or undef (8).  Masks containing a value in the range 9-15 are
/// rejected.
static bool isValidMask(unsigned short Mask) {
  // Walk the nibbles from least to most significant.
  for (unsigned Bits = Mask; Bits != 0; Bits >>= 4)
    if ((Bits & 0xF) > 8)
      return false;
  return true;
}
|
|
|
|
|
|
|
|
/// hasUndefElements - Return true if at least one nibble of Mask has its top
/// bit set.  In a valid mask that is exactly the case when some element is
/// undef (8).
static bool hasUndefElements(unsigned short Mask) {
  const unsigned UndefBits = 0x8888; // Bit 3 of each of the four nibbles.
  if (Mask & UndefBits)
    return true;
  return false;
}
|
|
|
|
|
|
|
|
/// isOnlyLHSMask - Return true if no nibble of Mask has bit 2 set.  For a
/// valid mask this means that no element refers to the RHS input (values 4-7);
/// undef elements (8) do not count as a reference.
static bool isOnlyLHSMask(unsigned short Mask) {
  const unsigned RHSBits = 0x4444; // Bit 2 of each of the four nibbles.
  return !(Mask & RHSBits);
}
|
|
|
|
|
|
|
|
/// getLHSOnlyMask - Given a mask that refers to both its LHS and RHS, return
/// the mask that refers to the LHS only (for when one argument value is passed
/// into the same function twice).  Currently compiled out.
#if 0
static unsigned short getLHSOnlyMask(unsigned short Mask) {
  // Dropping bit 2 of every nibble folds 4-7 onto 0-3 and keeps 0-3 and the
  // undef value 8 as they are.
  const unsigned short KeepBits = 0xBBBB;
  return Mask & KeepBits;
}
#endif
|
2006-04-17 00:30:41 +00:00
|
|
|
|
|
|
|
/// getCompressedMask - Turn a 16-bit uncompressed mask (where each elt uses 4
|
|
|
|
/// bits) into a compressed 13-bit mask, where each elt is multiplied by 9.
|
|
|
|
static unsigned getCompressedMask(unsigned short Mask) {
|
2009-08-21 12:39:38 +00:00
|
|
|
return getMaskElt(Mask, 0)*9*9*9 + getMaskElt(Mask, 1)*9*9 +
|
2006-04-17 00:30:41 +00:00
|
|
|
getMaskElt(Mask, 2)*9 + getMaskElt(Mask, 3);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// PrintMask - Write the mask i to OS in the form "<a,b,c,d>", printing the
/// undef value 8 as 'u' and every other element as the character '0' plus its
/// value.
static void PrintMask(unsigned i, std::ostream &OS) {
  OS << "<";
  for (unsigned Elt = 0; Elt != 4; ++Elt) {
    if (Elt != 0)
      OS << ",";
    const unsigned Val = getMaskElt(i, Elt);
    OS << (char)(Val == 8 ? 'u' : ('0' + Val));
  }
  OS << ">";
}
|
|
|
|
|
|
|
|
/// ShuffleVal - This represents a shufflevector operation.
|
|
|
|
struct ShuffleVal {
|
|
|
|
Operator *Op; // The Operation used to generate this value.
|
2015-12-14 21:57:05 +00:00
|
|
|
unsigned Cost; // Number of instrs used to generate this value.
|
2006-04-17 00:30:41 +00:00
|
|
|
unsigned short Arg0, Arg1; // Input operands for this value.
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
ShuffleVal() : Cost(1000000) {}
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
/// ShufTab - This is the actual shuffle table that we are trying to generate.
|
|
|
|
///
|
|
|
|
static ShuffleVal ShufTab[65536];
|
|
|
|
|
|
|
|
/// TheOperators - All of the operators that this target supports.
|
|
|
|
static std::vector<Operator*> TheOperators;
|
|
|
|
|
|
|
|
/// Operator - This is a vector operation that is available for use.
|
|
|
|
struct Operator {
|
2015-12-14 21:57:05 +00:00
|
|
|
const char *Name;
|
2006-04-17 00:30:41 +00:00
|
|
|
unsigned short ShuffleMask;
|
|
|
|
unsigned short OpNum;
|
2009-08-21 12:41:03 +00:00
|
|
|
unsigned Cost;
|
|
|
|
|
|
|
|
Operator(unsigned short shufflemask, const char *name, unsigned opnum,
|
|
|
|
unsigned cost = 1)
|
2015-12-14 21:57:05 +00:00
|
|
|
: Name(name), ShuffleMask(shufflemask), OpNum(opnum),Cost(cost) {
|
2006-04-17 00:30:41 +00:00
|
|
|
TheOperators.push_back(this);
|
|
|
|
}
|
|
|
|
~Operator() {
|
|
|
|
assert(TheOperators.back() == this);
|
|
|
|
TheOperators.pop_back();
|
|
|
|
}
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
bool isOnlyLHSOperator() const {
|
|
|
|
return isOnlyLHSMask(ShuffleMask);
|
|
|
|
}
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
const char *getName() const { return Name; }
|
2009-08-21 12:41:03 +00:00
|
|
|
unsigned getCost() const { return Cost; }
|
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
unsigned short getTransformedMask(unsigned short LHSMask, unsigned RHSMask) {
|
|
|
|
// Extract the elements from LHSMask and RHSMask, as appropriate.
|
|
|
|
unsigned Result = 0;
|
|
|
|
for (unsigned i = 0; i != 4; ++i) {
|
|
|
|
unsigned SrcElt = (ShuffleMask >> (4*i)) & 0xF;
|
|
|
|
unsigned ResElt;
|
|
|
|
if (SrcElt < 4)
|
|
|
|
ResElt = getMaskElt(LHSMask, SrcElt);
|
|
|
|
else if (SrcElt < 8)
|
|
|
|
ResElt = getMaskElt(RHSMask, SrcElt-4);
|
|
|
|
else {
|
|
|
|
assert(SrcElt == 8 && "Bad src elt!");
|
|
|
|
ResElt = 8;
|
|
|
|
}
|
|
|
|
Result |= ResElt << (4*i);
|
|
|
|
}
|
|
|
|
return Result;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2022-04-19 14:49:50 +01:00
|
|
|
#ifdef GENERATE_NEON_INS
|
|
|
|
// Special case "insert" op identifier used below
|
|
|
|
static Operator InsOp(0, "ins", 15, 1);
|
|
|
|
#endif
|
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
static const char *getZeroCostOpName(unsigned short Op) {
|
|
|
|
if (ShufTab[Op].Arg0 == 0x0123)
|
|
|
|
return "LHS";
|
|
|
|
else if (ShufTab[Op].Arg0 == 0x4567)
|
|
|
|
return "RHS";
|
|
|
|
else {
|
|
|
|
assert(0 && "bad zero cost operation");
|
|
|
|
abort();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void PrintOperation(unsigned ValNo, unsigned short Vals[]) {
|
|
|
|
unsigned short ThisOp = Vals[ValNo];
|
|
|
|
std::cerr << "t" << ValNo;
|
|
|
|
PrintMask(ThisOp, std::cerr);
|
|
|
|
std::cerr << " = " << ShufTab[ThisOp].Op->getName() << "(";
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
if (ShufTab[ShufTab[ThisOp].Arg0].Cost == 0) {
|
|
|
|
std::cerr << getZeroCostOpName(ShufTab[ThisOp].Arg0);
|
|
|
|
PrintMask(ShufTab[ThisOp].Arg0, std::cerr);
|
|
|
|
} else {
|
|
|
|
// Figure out what tmp # it is.
|
|
|
|
for (unsigned i = 0; ; ++i)
|
|
|
|
if (Vals[i] == ShufTab[ThisOp].Arg0) {
|
|
|
|
std::cerr << "t" << i;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2022-04-19 14:49:50 +01:00
|
|
|
#ifdef GENERATE_NEON_INS
|
|
|
|
if (ShufTab[ThisOp].Op == &InsOp) {
|
|
|
|
std::cerr << ", lane " << ShufTab[ThisOp].Arg1;
|
|
|
|
} else
|
|
|
|
#endif
|
2006-04-17 00:30:41 +00:00
|
|
|
if (!ShufTab[Vals[ValNo]].Op->isOnlyLHSOperator()) {
|
|
|
|
std::cerr << ", ";
|
|
|
|
if (ShufTab[ShufTab[ThisOp].Arg1].Cost == 0) {
|
|
|
|
std::cerr << getZeroCostOpName(ShufTab[ThisOp].Arg1);
|
|
|
|
PrintMask(ShufTab[ThisOp].Arg1, std::cerr);
|
|
|
|
} else {
|
|
|
|
// Figure out what tmp # it is.
|
|
|
|
for (unsigned i = 0; ; ++i)
|
|
|
|
if (Vals[i] == ShufTab[ThisOp].Arg1) {
|
|
|
|
std::cerr << "t" << i;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
std::cerr << ") ";
|
|
|
|
}
|
|
|
|
|
|
|
|
static unsigned getNumEntered() {
|
|
|
|
unsigned Count = 0;
|
|
|
|
for (unsigned i = 0; i != 65536; ++i)
|
|
|
|
Count += ShufTab[i].Cost < 100;
|
|
|
|
return Count;
|
|
|
|
}
|
|
|
|
|
2009-08-21 12:39:38 +00:00
|
|
|
static void EvaluateOps(unsigned short Elt, unsigned short Vals[],
|
2006-04-17 00:30:41 +00:00
|
|
|
unsigned &NumVals) {
|
|
|
|
if (ShufTab[Elt].Cost == 0) return;
|
2022-04-19 14:49:50 +01:00
|
|
|
#ifdef GENERATE_NEON_INS
|
|
|
|
if (ShufTab[Elt].Op == &InsOp) {
|
|
|
|
EvaluateOps(ShufTab[Elt].Arg0, Vals, NumVals);
|
|
|
|
Vals[NumVals++] = Elt;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif
|
2006-04-17 00:30:41 +00:00
|
|
|
|
|
|
|
// If this value has already been evaluated, it is free. FIXME: match undefs.
|
|
|
|
for (unsigned i = 0, e = NumVals; i != e; ++i)
|
|
|
|
if (Vals[i] == Elt) return;
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
// Otherwise, get the operands of the value, then add it.
|
|
|
|
unsigned Arg0 = ShufTab[Elt].Arg0, Arg1 = ShufTab[Elt].Arg1;
|
|
|
|
if (ShufTab[Arg0].Cost)
|
|
|
|
EvaluateOps(Arg0, Vals, NumVals);
|
|
|
|
if (Arg0 != Arg1 && ShufTab[Arg1].Cost)
|
|
|
|
EvaluateOps(Arg1, Vals, NumVals);
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
Vals[NumVals++] = Elt;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int main() {
|
|
|
|
// Seed the table with accesses to the LHS and RHS.
|
|
|
|
ShufTab[0x0123].Cost = 0;
|
2014-06-08 22:29:17 +00:00
|
|
|
ShufTab[0x0123].Op = nullptr;
|
2006-04-17 00:30:41 +00:00
|
|
|
ShufTab[0x0123].Arg0 = 0x0123;
|
|
|
|
ShufTab[0x4567].Cost = 0;
|
2014-06-08 22:29:17 +00:00
|
|
|
ShufTab[0x4567].Op = nullptr;
|
2006-04-17 00:30:41 +00:00
|
|
|
ShufTab[0x4567].Arg0 = 0x4567;
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
// Seed the first-level of shuffles, shuffles whose inputs are the input to
|
|
|
|
// the vectorshuffle operation.
|
|
|
|
bool MadeChange = true;
|
|
|
|
unsigned OpCount = 0;
|
|
|
|
while (MadeChange) {
|
|
|
|
MadeChange = false;
|
|
|
|
++OpCount;
|
|
|
|
std::cerr << "Starting iteration #" << OpCount << " with "
|
|
|
|
<< getNumEntered() << " entries established.\n";
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
// Scan the table for two reasons: First, compute the maximum cost of any
|
|
|
|
// operation left in the table. Second, make sure that values with undefs
|
|
|
|
// have the cheapest alternative that they match.
|
|
|
|
unsigned MaxCost = ShufTab[0].Cost;
|
|
|
|
for (unsigned i = 1; i != 0x8889; ++i) {
|
|
|
|
if (!isValidMask(i)) continue;
|
|
|
|
if (ShufTab[i].Cost > MaxCost)
|
|
|
|
MaxCost = ShufTab[i].Cost;
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
// If this value has an undef, make it be computed the cheapest possible
|
|
|
|
// way of any of the things that it matches.
|
|
|
|
if (hasUndefElements(i)) {
|
|
|
|
// This code is a little bit tricky, so here's the idea: consider some
|
|
|
|
// permutation, like 7u4u. To compute the lowest cost for 7u4u, we
|
|
|
|
// need to take the minimum cost of all of 7[0-8]4[0-8], 81 entries. If
|
|
|
|
// there are 3 undefs, the number rises to 729 entries we have to scan,
|
|
|
|
// and for the 4 undef case, we have to scan the whole table.
|
|
|
|
//
|
|
|
|
// Instead of doing this huge amount of scanning, we process the table
|
|
|
|
// entries *in order*, and use the fact that 'u' is 8, larger than any
|
|
|
|
// valid index. Given an entry like 7u4u then, we only need to scan
|
|
|
|
// 7[0-7]4u - 8 entries. We can get away with this, because we already
|
|
|
|
// know that each of 704u, 714u, 724u, etc contain the minimum value of
|
|
|
|
// all of the 704[0-8], 714[0-8] and 724[0-8] entries respectively.
|
|
|
|
unsigned UndefIdx;
|
|
|
|
if (i & 0x8000)
|
|
|
|
UndefIdx = 0;
|
|
|
|
else if (i & 0x0800)
|
|
|
|
UndefIdx = 1;
|
|
|
|
else if (i & 0x0080)
|
|
|
|
UndefIdx = 2;
|
|
|
|
else if (i & 0x0008)
|
|
|
|
UndefIdx = 3;
|
|
|
|
else
|
|
|
|
abort();
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
unsigned MinVal = i;
|
|
|
|
unsigned MinCost = ShufTab[i].Cost;
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
// Scan the 8 entries.
|
|
|
|
for (unsigned j = 0; j != 8; ++j) {
|
|
|
|
unsigned NewElt = setMaskElt(i, UndefIdx, j);
|
|
|
|
if (ShufTab[NewElt].Cost < MinCost) {
|
|
|
|
MinCost = ShufTab[NewElt].Cost;
|
|
|
|
MinVal = NewElt;
|
|
|
|
}
|
|
|
|
}
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
// If we found something cheaper than what was here before, use it.
|
|
|
|
if (i != MinVal) {
|
|
|
|
MadeChange = true;
|
|
|
|
ShufTab[i] = ShufTab[MinVal];
|
|
|
|
}
|
2009-08-21 12:39:38 +00:00
|
|
|
}
|
2022-04-19 14:49:50 +01:00
|
|
|
#ifdef GENERATE_NEON_INS
|
|
|
|
else {
|
|
|
|
// Similarly, if we take the mask (eg 3,6,1,0) and take the cost with
|
|
|
|
// undef for each lane (eg u,6,1,0 or 3,u,1,0 etc), we can use a single
|
|
|
|
// lane insert to fixup the result.
|
|
|
|
for (unsigned LaneIdx = 0; LaneIdx < 4; LaneIdx++) {
|
|
|
|
if (getMaskElt(i, LaneIdx) == 8)
|
|
|
|
continue;
|
|
|
|
unsigned NewElt = setMaskElt(i, LaneIdx, 8);
|
|
|
|
if (ShufTab[NewElt].Cost + 1 < ShufTab[i].Cost) {
|
|
|
|
MadeChange = true;
|
|
|
|
ShufTab[i].Cost = ShufTab[NewElt].Cost + 1;
|
|
|
|
ShufTab[i].Op = &InsOp;
|
|
|
|
ShufTab[i].Arg0 = NewElt;
|
|
|
|
ShufTab[i].Arg1 = LaneIdx;
|
|
|
|
}
|
|
|
|
}
|
2022-05-17 18:16:45 +01:00
|
|
|
|
|
|
|
// Similar idea for using a D register mov, masking out 2 lanes to undef
|
|
|
|
for (unsigned LaneIdx = 0; LaneIdx < 4; LaneIdx += 2) {
|
|
|
|
unsigned Ln0 = getMaskElt(i, LaneIdx);
|
|
|
|
unsigned Ln1 = getMaskElt(i, LaneIdx + 1);
|
|
|
|
if ((Ln0 == 0 && Ln1 == 1) || (Ln0 == 2 && Ln1 == 3) ||
|
|
|
|
(Ln0 == 4 && Ln1 == 5) || (Ln0 == 6 && Ln1 == 7)) {
|
|
|
|
unsigned NewElt = setMaskElt(i, LaneIdx, 8);
|
|
|
|
NewElt = setMaskElt(NewElt, LaneIdx + 1, 8);
|
|
|
|
if (ShufTab[NewElt].Cost + 1 < ShufTab[i].Cost) {
|
|
|
|
MadeChange = true;
|
|
|
|
ShufTab[i].Cost = ShufTab[NewElt].Cost + 1;
|
|
|
|
ShufTab[i].Op = &InsOp;
|
|
|
|
ShufTab[i].Arg0 = NewElt;
|
|
|
|
ShufTab[i].Arg1 = (LaneIdx >> 1) | 0x4;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2022-04-19 14:49:50 +01:00
|
|
|
}
|
|
|
|
#endif
|
2006-04-17 00:30:41 +00:00
|
|
|
}
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
for (unsigned LHS = 0; LHS != 0x8889; ++LHS) {
|
|
|
|
if (!isValidMask(LHS)) continue;
|
|
|
|
if (ShufTab[LHS].Cost > 1000) continue;
|
|
|
|
|
|
|
|
// If nothing involving this operand could possibly be cheaper than what
|
|
|
|
// we already have, don't consider it.
|
|
|
|
if (ShufTab[LHS].Cost + 1 >= MaxCost)
|
|
|
|
continue;
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
for (unsigned opnum = 0, e = TheOperators.size(); opnum != e; ++opnum) {
|
|
|
|
Operator *Op = TheOperators[opnum];
|
2022-04-19 14:49:50 +01:00
|
|
|
#ifdef GENERATE_NEON_INS
|
|
|
|
if (Op == &InsOp)
|
|
|
|
continue;
|
|
|
|
#endif
|
2006-04-17 00:30:41 +00:00
|
|
|
|
|
|
|
// Evaluate op(LHS,LHS)
|
|
|
|
unsigned ResultMask = Op->getTransformedMask(LHS, LHS);
|
|
|
|
|
2009-08-21 12:41:03 +00:00
|
|
|
unsigned Cost = ShufTab[LHS].Cost + Op->getCost();
|
2006-04-17 00:30:41 +00:00
|
|
|
if (Cost < ShufTab[ResultMask].Cost) {
|
|
|
|
ShufTab[ResultMask].Cost = Cost;
|
|
|
|
ShufTab[ResultMask].Op = Op;
|
|
|
|
ShufTab[ResultMask].Arg0 = LHS;
|
|
|
|
ShufTab[ResultMask].Arg1 = LHS;
|
|
|
|
MadeChange = true;
|
|
|
|
}
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
// If this is a two input instruction, include the op(x,y) cases. If
|
|
|
|
// this is a one input instruction, skip this.
|
|
|
|
if (Op->isOnlyLHSOperator()) continue;
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
for (unsigned RHS = 0; RHS != 0x8889; ++RHS) {
|
|
|
|
if (!isValidMask(RHS)) continue;
|
|
|
|
if (ShufTab[RHS].Cost > 1000) continue;
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
// If nothing involving this operand could possibly be cheaper than
|
|
|
|
// what we already have, don't consider it.
|
|
|
|
if (ShufTab[RHS].Cost + 1 >= MaxCost)
|
|
|
|
continue;
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
|
|
|
|
// Evaluate op(LHS,RHS)
|
|
|
|
unsigned ResultMask = Op->getTransformedMask(LHS, RHS);
|
|
|
|
|
|
|
|
if (ShufTab[ResultMask].Cost <= OpCount ||
|
|
|
|
ShufTab[ResultMask].Cost <= ShufTab[LHS].Cost ||
|
|
|
|
ShufTab[ResultMask].Cost <= ShufTab[RHS].Cost)
|
|
|
|
continue;
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
// Figure out the cost to evaluate this, knowing that CSE's only need
|
|
|
|
// to be evaluated once.
|
|
|
|
unsigned short Vals[30];
|
|
|
|
unsigned NumVals = 0;
|
|
|
|
EvaluateOps(LHS, Vals, NumVals);
|
|
|
|
EvaluateOps(RHS, Vals, NumVals);
|
|
|
|
|
2009-08-21 12:41:03 +00:00
|
|
|
unsigned Cost = NumVals + Op->getCost();
|
2006-04-17 00:30:41 +00:00
|
|
|
if (Cost < ShufTab[ResultMask].Cost) {
|
|
|
|
ShufTab[ResultMask].Cost = Cost;
|
|
|
|
ShufTab[ResultMask].Op = Op;
|
|
|
|
ShufTab[ResultMask].Arg0 = LHS;
|
|
|
|
ShufTab[ResultMask].Arg1 = RHS;
|
|
|
|
MadeChange = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
std::cerr << "Finished Table has " << getNumEntered()
|
|
|
|
<< " entries established.\n";
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
unsigned CostArray[10] = { 0 };
|
|
|
|
|
|
|
|
// Compute a cost histogram.
|
|
|
|
for (unsigned i = 0; i != 65536; ++i) {
|
|
|
|
if (!isValidMask(i)) continue;
|
|
|
|
if (ShufTab[i].Cost > 9)
|
|
|
|
++CostArray[9];
|
|
|
|
else
|
|
|
|
++CostArray[ShufTab[i].Cost];
|
|
|
|
}
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
for (unsigned i = 0; i != 9; ++i)
|
|
|
|
if (CostArray[i])
|
|
|
|
std::cout << "// " << CostArray[i] << " entries have cost " << i << "\n";
|
|
|
|
if (CostArray[9])
|
|
|
|
std::cout << "// " << CostArray[9] << " entries have higher cost!\n";
|
2009-08-21 12:39:38 +00:00
|
|
|
|
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
// Build up the table to emit.
|
|
|
|
std::cout << "\n// This table is 6561*4 = 26244 bytes in size.\n";
|
2006-04-17 00:33:35 +00:00
|
|
|
std::cout << "static const unsigned PerfectShuffleTable[6561+1] = {\n";
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
for (unsigned i = 0; i != 0x8889; ++i) {
|
|
|
|
if (!isValidMask(i)) continue;
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
// CostSat - The cost of this operation saturated to two bits.
|
|
|
|
unsigned CostSat = ShufTab[i].Cost;
|
2006-04-17 05:25:16 +00:00
|
|
|
if (CostSat > 4) CostSat = 4;
|
|
|
|
if (CostSat == 0) CostSat = 1;
|
|
|
|
--CostSat; // Cost is now between 0-3.
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
unsigned OpNum = ShufTab[i].Op ? ShufTab[i].Op->OpNum : 0;
|
|
|
|
assert(OpNum < 16 && "Too few bits to encode operation!");
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
unsigned LHS = getCompressedMask(ShufTab[i].Arg0);
|
|
|
|
unsigned RHS = getCompressedMask(ShufTab[i].Arg1);
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
// Encode this as 2 bits of saturated cost, 4 bits of opcodes, 13 bits of
|
|
|
|
// LHS, and 13 bits of RHS = 32 bits.
|
2006-04-17 05:05:52 +00:00
|
|
|
unsigned Val = (CostSat << 30) | (OpNum << 26) | (LHS << 13) | RHS;
|
2006-04-17 00:30:41 +00:00
|
|
|
|
2010-10-14 00:12:49 +00:00
|
|
|
std::cout << " " << std::setw(10) << Val << "U, // ";
|
2006-04-17 00:30:41 +00:00
|
|
|
PrintMask(i, std::cout);
|
|
|
|
std::cout << ": Cost " << ShufTab[i].Cost;
|
|
|
|
std::cout << " " << (ShufTab[i].Op ? ShufTab[i].Op->getName() : "copy");
|
|
|
|
std::cout << " ";
|
|
|
|
if (ShufTab[ShufTab[i].Arg0].Cost == 0) {
|
|
|
|
std::cout << getZeroCostOpName(ShufTab[i].Arg0);
|
|
|
|
} else {
|
|
|
|
PrintMask(ShufTab[i].Arg0, std::cout);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ShufTab[i].Op && !ShufTab[i].Op->isOnlyLHSOperator()) {
|
|
|
|
std::cout << ", ";
|
|
|
|
if (ShufTab[ShufTab[i].Arg1].Cost == 0) {
|
|
|
|
std::cout << getZeroCostOpName(ShufTab[i].Arg1);
|
|
|
|
} else {
|
|
|
|
PrintMask(ShufTab[i].Arg1, std::cout);
|
|
|
|
}
|
|
|
|
}
|
2022-04-19 14:49:50 +01:00
|
|
|
#ifdef GENERATE_NEON_INS
|
|
|
|
else if (ShufTab[i].Op == &InsOp) {
|
|
|
|
std::cout << ", lane " << ShufTab[i].Arg1;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
std::cout << "\n";
|
2009-08-21 12:39:38 +00:00
|
|
|
}
|
2006-04-17 00:30:41 +00:00
|
|
|
std::cout << " 0\n};\n";
|
|
|
|
|
2022-01-07 00:39:13 -08:00
|
|
|
if (false) {
|
2006-04-17 00:30:41 +00:00
|
|
|
// Print out the table.
|
|
|
|
for (unsigned i = 0; i != 0x8889; ++i) {
|
|
|
|
if (!isValidMask(i)) continue;
|
|
|
|
if (ShufTab[i].Cost < 1000) {
|
|
|
|
PrintMask(i, std::cerr);
|
|
|
|
std::cerr << " - Cost " << ShufTab[i].Cost << " - ";
|
2009-08-21 12:39:38 +00:00
|
|
|
|
2006-04-17 00:30:41 +00:00
|
|
|
unsigned short Vals[30];
|
|
|
|
unsigned NumVals = 0;
|
|
|
|
EvaluateOps(i, Vals, NumVals);
|
|
|
|
|
|
|
|
for (unsigned j = 0, e = NumVals; j != e; ++j)
|
|
|
|
PrintOperation(j, Vals);
|
|
|
|
std::cerr << "\n";
|
|
|
|
}
|
|
|
|
}
|
2020-03-25 14:04:59 -04:00
|
|
|
}
|
2006-04-17 00:30:41 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2006-04-17 00:47:18 +00:00
|
|
|
#ifdef GENERATE_ALTIVEC

///===---------------------------------------------------------------------===//
/// The altivec instruction definitions.  This is the altivec-specific part of
/// this file.
///===---------------------------------------------------------------------===//

// Note that the opcode numbers here must match those in the PPC backend.
enum {
  OP_COPY      = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
  OP_VMRGHW    = 1,
  OP_VMRGLW    = 2,
  OP_VSPLTISW0 = 3,
  OP_VSPLTISW1 = 4,
  OP_VSPLTISW2 = 5,
  OP_VSPLTISW3 = 6,
  OP_VSLDOI4   = 7,
  OP_VSLDOI8   = 8,
  OP_VSLDOI12  = 9
};

// The operators below register themselves in definition order, so the order
// of these definitions is the order in which the search tries them.

/// vmrghw - Produces <0,4,1,5>.
struct vmrghw : Operator {
  vmrghw() : Operator(MakeMask(0, 4, 1, 5), "vmrghw", OP_VMRGHW) {}
} the_vmrghw;

/// vmrglw - Produces <2,6,3,7>.
struct vmrglw : Operator {
  vmrglw() : Operator(MakeMask(2, 6, 3, 7), "vmrglw", OP_VMRGLW) {}
} the_vmrglw;

/// vspltisw - Replicates element Elt of the first input into all four
/// positions.
template<unsigned Elt>
struct vspltisw : Operator {
  vspltisw(const char *OpName, unsigned Opcode)
      : Operator(MakeMask(Elt, Elt, Elt, Elt), OpName, Opcode) {}
};

vspltisw<0> the_vspltisw0("vspltisw0", OP_VSPLTISW0);
vspltisw<1> the_vspltisw1("vspltisw1", OP_VSPLTISW1);
vspltisw<2> the_vspltisw2("vspltisw2", OP_VSPLTISW2);
vspltisw<3> the_vspltisw3("vspltisw3", OP_VSPLTISW3);

/// vsldoi - Produces the four consecutive elements starting at element N of
/// the two inputs, wrapping around after element 7.
template<unsigned N>
struct vsldoi : Operator {
  vsldoi(const char *OpName, unsigned Opcode)
      : Operator(MakeMask(N % 8, (N + 1) % 8, (N + 2) % 8, (N + 3) % 8),
                 OpName, Opcode) {}
};

vsldoi<1> the_vsldoi1("vsldoi4" , OP_VSLDOI4);
vsldoi<2> the_vsldoi2("vsldoi8" , OP_VSLDOI8);
vsldoi<3> the_vsldoi3("vsldoi12", OP_VSLDOI12);

#endif
|
2009-08-21 12:41:24 +00:00
|
|
|
|
|
|
|
#ifdef GENERATE_NEON
|
|
|
|
// Opcode numbers of the NEON operations.  They are written into the emitted
// table as a 4-bit field.
enum {
  OP_COPY  = 0,  // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
  OP_VREV  = 1,
  OP_VDUP0 = 2,
  OP_VDUP1 = 3,
  OP_VDUP2 = 4,
  OP_VDUP3 = 5,
  OP_VEXT1 = 6,
  OP_VEXT2 = 7,
  OP_VEXT3 = 8,
  OP_VUZPL = 9,  // VUZP, left result
  OP_VUZPR = 10, // VUZP, right result
  OP_VZIPL = 11, // VZIP, left result
  OP_VZIPR = 12, // VZIP, right result
  OP_VTRNL = 13, // VTRN, left result
  OP_VTRNR = 14  // VTRN, right result
};
|
|
|
|
|
|
|
|
struct vrev : public Operator {
|
2011-05-18 06:42:21 +00:00
|
|
|
vrev() : Operator(0x1032, "vrev", OP_VREV) {}
|
2009-08-21 12:41:24 +00:00
|
|
|
} the_vrev;
|
|
|
|
|
|
|
|
template<unsigned Elt>
|
|
|
|
struct vdup : public Operator {
|
|
|
|
vdup(const char *N, unsigned Opc)
|
|
|
|
: Operator(MakeMask(Elt, Elt, Elt, Elt), N, Opc) {}
|
|
|
|
};
|
|
|
|
|
|
|
|
vdup<0> the_vdup0("vdup0", OP_VDUP0);
|
|
|
|
vdup<1> the_vdup1("vdup1", OP_VDUP1);
|
|
|
|
vdup<2> the_vdup2("vdup2", OP_VDUP2);
|
|
|
|
vdup<3> the_vdup3("vdup3", OP_VDUP3);
|
|
|
|
|
|
|
|
template<unsigned N>
|
|
|
|
struct vext : public Operator {
|
|
|
|
vext(const char *Name, unsigned Opc)
|
|
|
|
: Operator(MakeMask(N&7, (N+1)&7, (N+2)&7, (N+3)&7), Name, Opc) {
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
vext<1> the_vext1("vext1", OP_VEXT1);
|
|
|
|
vext<2> the_vext2("vext2", OP_VEXT2);
|
|
|
|
vext<3> the_vext3("vext3", OP_VEXT3);
|
|
|
|
|
|
|
|
struct vuzpl : public Operator {
|
[AArch64] Cost all perfect shuffles entries as cost 1
A brief introduction to perfect shuffles - AArch64 NEON has a number of
shuffle operations - dups, zips, exts, movs etc that can in some way
shuffle around the lanes of a vector. Given a shuffle of size 4 with 2
inputs, some shuffle masks can be easily codegen'd to a single
instruction. A <0,0,1,1> mask for example is a zip LHS, LHS. This is
great, but some masks are not so simple, like a <0,0,1,2>. It turns out
we can generate that from zip LHS, <0,2,0,2>, having generated
<0,2,0,2> from uzp LHS, LHS, producing the result in 2 instructions.
It is not obvious from a given mask how to get there though. So we have
a simple program (PerfectShuffle.cpp in the util folder) that can scan
through all combinations of 4-element vectors and generate the perfect
combination of results needed for each shuffle mask (for some definition
of perfect). This is run offline to generate a table that is queried for
generating shuffle instructions. (Because the table could get quite big,
it is limited to 4 element vectors).
In the perfect shuffle tables zip, unz and trn shuffles were being cost
as 2, which is higher than needed and skews the perfect shuffle tables
to create inefficient combinations. This sets them to 1 and regenerates
the tables. The codegen will usually be better and the costs should be
more precise (but it can get less second-order re-use of values from
multiple shuffles, these cases should be fixed up in subsequent patches.
Differential Revision: https://reviews.llvm.org/D123379
2022-04-19 12:05:05 +01:00
|
|
|
vuzpl() : Operator(0x0246, "vuzpl", OP_VUZPL, 1) {}
|
2009-08-21 12:41:24 +00:00
|
|
|
} the_vuzpl;
|
|
|
|
|
|
|
|
struct vuzpr : public Operator {
|
[AArch64] Cost all perfect shuffles entries as cost 1
A brief introduction to perfect shuffles - AArch64 NEON has a number of
shuffle operations - dups, zips, exts, movs etc that can in some way
shuffle around the lanes of a vector. Given a shuffle of size 4 with 2
inputs, some shuffle masks can be easily codegen'd to a single
instruction. A <0,0,1,1> mask for example is a zip LHS, LHS. This is
great, but some masks are not so simple, like a <0,0,1,2>. It turns out
we can generate that from zip LHS, <0,2,0,2>, having generated
<0,2,0,2> from uzp LHS, LHS, producing the result in 2 instructions.
It is not obvious from a given mask how to get there though. So we have
a simple program (PerfectShuffle.cpp in the util folder) that can scan
through all combinations of 4-element vectors and generate the perfect
combination of results needed for each shuffle mask (for some definition
of perfect). This is run offline to generate a table that is queried for
generating shuffle instructions. (Because the table could get quite big,
it is limited to 4 element vectors).
In the perfect shuffle tables zip, unz and trn shuffles were being cost
as 2, which is higher than needed and skews the perfect shuffle tables
to create inefficient combinations. This sets them to 1 and regenerates
the tables. The codegen will usually be better and the costs should be
more precise (but it can get less second-order re-use of values from
multiple shuffles, these cases should be fixed up in subsequent patches.
Differential Revision: https://reviews.llvm.org/D123379
2022-04-19 12:05:05 +01:00
|
|
|
vuzpr() : Operator(0x1357, "vuzpr", OP_VUZPR, 1) {}
|
2009-08-21 12:41:24 +00:00
|
|
|
} the_vuzpr;
|
|
|
|
|
|
|
|
struct vzipl : public Operator {
|
[AArch64] Cost all perfect shuffles entries as cost 1
A brief introduction to perfect shuffles - AArch64 NEON has a number of
shuffle operations - dups, zips, exts, movs etc that can in some way
shuffle around the lanes of a vector. Given a shuffle of size 4 with 2
inputs, some shuffle masks can be easily codegen'd to a single
instruction. A <0,0,1,1> mask for example is a zip LHS, LHS. This is
great, but some masks are not so simple, like a <0,0,1,2>. It turns out
we can generate that from zip LHS, <0,2,0,2>, having generated
<0,2,0,2> from uzp LHS, LHS, producing the result in 2 instructions.
It is not obvious from a given mask how to get there though. So we have
a simple program (PerfectShuffle.cpp in the util folder) that can scan
through all combinations of 4-element vectors and generate the perfect
combination of results needed for each shuffle mask (for some definition
of perfect). This is run offline to generate a table that is queried for
generating shuffle instructions. (Because the table could get quite big,
it is limited to 4 element vectors).
In the perfect shuffle tables zip, unz and trn shuffles were being cost
as 2, which is higher than needed and skews the perfect shuffle tables
to create inefficient combinations. This sets them to 1 and regenerates
the tables. The codegen will usually be better and the costs should be
more precise (but it can get less second-order re-use of values from
multiple shuffles, these cases should be fixed up in subsequent patches.
Differential Revision: https://reviews.llvm.org/D123379
2022-04-19 12:05:05 +01:00
|
|
|
vzipl() : Operator(0x0415, "vzipl", OP_VZIPL, 1) {}
|
2009-08-21 12:41:24 +00:00
|
|
|
} the_vzipl;
|
|
|
|
|
|
|
|
struct vzipr : public Operator {
|
[AArch64] Cost all perfect shuffles entries as cost 1
A brief introduction to perfect shuffles - AArch64 NEON has a number of
shuffle operations - dups, zips, exts, movs etc that can in some way
shuffle around the lanes of a vector. Given a shuffle of size 4 with 2
inputs, some shuffle masks can be easily codegen'd to a single
instruction. A <0,0,1,1> mask for example is a zip LHS, LHS. This is
great, but some masks are not so simple, like a <0,0,1,2>. It turns out
we can generate that from zip LHS, <0,2,0,2>, having generated
<0,2,0,2> from uzp LHS, LHS, producing the result in 2 instructions.
It is not obvious from a given mask how to get there though. So we have
a simple program (PerfectShuffle.cpp in the util folder) that can scan
through all combinations of 4-element vectors and generate the perfect
combination of results needed for each shuffle mask (for some definition
of perfect). This is run offline to generate a table that is queried for
generating shuffle instructions. (Because the table could get quite big,
it is limited to 4 element vectors).
In the perfect shuffle tables zip, uzp and trn shuffles were being costed
as 2, which is higher than needed and skews the perfect shuffle tables
to create inefficient combinations. This sets them to 1 and regenerates
the tables. The codegen will usually be better and the costs should be
more precise (but it can get less second-order re-use of values from
multiple shuffles; these cases should be fixed up in subsequent patches).
Differential Revision: https://reviews.llvm.org/D123379
2022-04-19 12:05:05 +01:00
|
|
|
// Registers the "vzipr" operator by forwarding to the Operator base.
// 0x2637 reads as a 4-nibble shuffle mask selecting elements 2,6,3,7 of the
// two inputs; OP_VZIPR is the opcode tag.
// NOTE(review): the trailing 1 is presumably the cost -- confirm against
// Operator's constructor.
vzipr()
    : Operator(0x2637, "vzipr", OP_VZIPR, 1) {}
|
2009-08-21 12:41:24 +00:00
|
|
|
} the_vzipr;
|
|
|
|
|
|
|
|
struct vtrnl : public Operator {
|
[AArch64] Cost all perfect shuffles entries as cost 1
A brief introduction to perfect shuffles - AArch64 NEON has a number of
shuffle operations - dups, zips, exts, movs etc that can in some way
shuffle around the lanes of a vector. Given a shuffle of size 4 with 2
inputs, some shuffle masks can be easily codegen'd to a single
instruction. A <0,0,1,1> mask for example is a zip LHS, LHS. This is
great, but some masks are not so simple, like a <0,0,1,2>. It turns out
we can generate that from zip LHS, <0,2,0,2>, having generated
<0,2,0,2> from uzp LHS, LHS, producing the result in 2 instructions.
It is not obvious from a given mask how to get there though. So we have
a simple program (PerfectShuffle.cpp in the util folder) that can scan
through all combinations of 4-element vectors and generate the perfect
combination of results needed for each shuffle mask (for some definition
of perfect). This is run offline to generate a table that is queried for
generating shuffle instructions. (Because the table could get quite big,
it is limited to 4 element vectors).
In the perfect shuffle tables zip, uzp and trn shuffles were being costed
as 2, which is higher than needed and skews the perfect shuffle tables
to create inefficient combinations. This sets them to 1 and regenerates
the tables. The codegen will usually be better and the costs should be
more precise (but it can get less second-order re-use of values from
multiple shuffles; these cases should be fixed up in subsequent patches).
Differential Revision: https://reviews.llvm.org/D123379
2022-04-19 12:05:05 +01:00
|
|
|
// Registers the "vtrnl" operator by forwarding to the Operator base.
// 0x0426 reads as a 4-nibble shuffle mask selecting elements 0,4,2,6 of the
// two inputs; OP_VTRNL is the opcode tag.
// NOTE(review): the trailing 1 is presumably the cost -- confirm against
// Operator's constructor.
vtrnl()
    : Operator(0x0426, "vtrnl", OP_VTRNL, 1) {}
|
2009-08-21 12:41:24 +00:00
|
|
|
} the_vtrnl;
|
|
|
|
|
|
|
|
struct vtrnr : public Operator {
|
[AArch64] Cost all perfect shuffles entries as cost 1
A brief introduction to perfect shuffles - AArch64 NEON has a number of
shuffle operations - dups, zips, exts, movs etc that can in some way
shuffle around the lanes of a vector. Given a shuffle of size 4 with 2
inputs, some shuffle masks can be easily codegen'd to a single
instruction. A <0,0,1,1> mask for example is a zip LHS, LHS. This is
great, but some masks are not so simple, like a <0,0,1,2>. It turns out
we can generate that from zip LHS, <0,2,0,2>, having generated
<0,2,0,2> from uzp LHS, LHS, producing the result in 2 instructions.
It is not obvious from a given mask how to get there though. So we have
a simple program (PerfectShuffle.cpp in the util folder) that can scan
through all combinations of 4-element vectors and generate the perfect
combination of results needed for each shuffle mask (for some definition
of perfect). This is run offline to generate a table that is queried for
generating shuffle instructions. (Because the table could get quite big,
it is limited to 4 element vectors).
In the perfect shuffle tables zip, uzp and trn shuffles were being costed
as 2, which is higher than needed and skews the perfect shuffle tables
to create inefficient combinations. This sets them to 1 and regenerates
the tables. The codegen will usually be better and the costs should be
more precise (but it can get less second-order re-use of values from
multiple shuffles; these cases should be fixed up in subsequent patches).
Differential Revision: https://reviews.llvm.org/D123379
2022-04-19 12:05:05 +01:00
|
|
|
// Registers the "vtrnr" operator by forwarding to the Operator base.
// 0x1537 reads as a 4-nibble shuffle mask selecting elements 1,5,3,7 of the
// two inputs; OP_VTRNR is the opcode tag.
// NOTE(review): the trailing 1 is presumably the cost -- confirm against
// Operator's constructor.
vtrnr()
    : Operator(0x1537, "vtrnr", OP_VTRNR, 1) {}
|
2009-08-21 12:41:24 +00:00
|
|
|
} the_vtrnr;
|
|
|
|
|
|
|
|
#endif
|