llvm-project/llvm/lib/IR/StructuralHash.cpp

//===-- StructuralHash.cpp - IR Hashing -------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "llvm/IR/StructuralHash.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"

using namespace llvm;

namespace {

// Basic hashing mechanism to detect structural change to the IR, used to verify
// pass return status consistency with actual change. In addition to being used
// by the MergeFunctions pass.

class StructuralHashImpl {
  uint64_t Hash;

  void hash(uint64_t V) { Hash = hashing::detail::hash_16_bytes(Hash, V); }

  // This will produce different values on 32-bit and 64-bit systens as
  // hash_combine returns a size_t. However, this is only used for
  // detailed hashing which, in-tree, only needs to distinguish between
  // differences in functions.
  template <typename T> void hashArbitaryType(const T &V) {
    hash(hash_combine(V));
  }

  void hashType(Type *ValueType) {
    hash(ValueType->getTypeID());
    if (ValueType->isIntegerTy())
      hash(ValueType->getIntegerBitWidth());
  }

public:
  StructuralHashImpl() : Hash(4) {}

  void updateOperand(Value *Operand) {
    hashType(Operand->getType());

    // The cases enumerated below are not exhaustive and are only aimed to
    // get decent coverage over the function.
    if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(Operand)) {
      hashArbitaryType(ConstInt->getValue());
    } else if (ConstantFP *ConstFP = dyn_cast<ConstantFP>(Operand)) {
      hashArbitaryType(ConstFP->getValue());
    } else if (Argument *Arg = dyn_cast<Argument>(Operand)) {
      hash(Arg->getArgNo());
    } else if (Function *Func = dyn_cast<Function>(Operand)) {
      // Hashing the name will be deterministic as LLVM's hashing infrastructure
      // has explicit support for hashing strings and will not simply hash
      // the pointer.
      hashArbitaryType(Func->getName());
    }
  }

  void updateInstruction(const Instruction &Inst, bool DetailedHash) {
    hash(Inst.getOpcode());

    if (!DetailedHash)
      return;

    hashType(Inst.getType());

    // Handle additional properties of specific instructions that cause
    // semantic differences in the IR.
    if (const auto *ComparisonInstruction = dyn_cast<CmpInst>(&Inst))
      hash(ComparisonInstruction->getPredicate());

    for (const auto &Op : Inst.operands())
      updateOperand(Op);
  }

  // A function hash is calculated by considering only the number of arguments
  // and whether a function is varargs, the order of basic blocks (given by the
  // successors of each basic block in depth first order), and the order of
  // opcodes of each instruction within each of these basic blocks. This mirrors
  // the strategy FunctionComparator::compare() uses to compare functions by
  // walking the BBs in depth first order and comparing each instruction in
  // sequence. Because this hash currently does not look at the operands, it is
  // insensitive to things such as the target of calls and the constants used in
  // the function, which makes it useful when possibly merging functions which
  // are the same modulo constants and call targets.
  //
  // Note that different users of StructuralHash will want different behavior
  // out of it (i.e., MergeFunctions will want something different from PM
  // expensive checks for pass modification status). When modifying this
  // function, most changes should be gated behind an option and enabled
  // selectively.
  void update(const Function &F, bool DetailedHash) {
    // Declarations don't affect analyses.
    if (F.isDeclaration())
      return;

    hash(0x62642d6b6b2d6b72); // Function header

    hash(F.isVarArg());
    hash(F.arg_size());

    SmallVector<const BasicBlock *, 8> BBs;
    SmallPtrSet<const BasicBlock *, 16> VisitedBBs;

    // Walk the blocks in the same order as
    // FunctionComparator::cmpBasicBlocks(), accumulating the hash of the
    // function "structure." (BB and opcode sequence)
    BBs.push_back(&F.getEntryBlock());
    VisitedBBs.insert(BBs[0]);
    while (!BBs.empty()) {
      const BasicBlock *BB = BBs.pop_back_val();

      // This random value acts as a block header, as otherwise the partition of
      // opcodes into BBs wouldn't affect the hash, only the order of the
      // opcodes
      hash(45798);
      for (auto &Inst : *BB)
        updateInstruction(Inst, DetailedHash);

      const Instruction *Term = BB->getTerminator();
      for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
        if (!VisitedBBs.insert(Term->getSuccessor(i)).second)
          continue;
        BBs.push_back(Term->getSuccessor(i));
      }
    }
  }

  void update(const GlobalVariable &GV) {
    // Declarations and used/compiler.used don't affect analyses.
    // Since there are several `llvm.*` metadata, like `llvm.embedded.object`,
    // we ignore anything with the `.llvm` prefix
    if (GV.isDeclaration() || GV.getName().starts_with("llvm."))
      return;
    hash(23456); // Global header
    hash(GV.getValueType()->getTypeID());
  }

  void update(const Module &M, bool DetailedHash) {
    for (const GlobalVariable &GV : M.globals())
      update(GV);
    for (const Function &F : M)
      update(F, DetailedHash);
  }

  uint64_t getHash() const { return Hash; }
};

} // namespace

IRHash llvm::StructuralHash(const Function &F, bool DetailedHash) {
  StructuralHashImpl H;
  H.update(F, DetailedHash);
  return H.getHash();
}

IRHash llvm::StructuralHash(const Module &M, bool DetailedHash) {
  StructuralHashImpl H;
  H.update(M, DetailedHash);
  return H.getHash();
}