Description
LLVM Versions Tested: 18, 19, 20, current tip
Issue Summary
LLVM components incorrectly detect which CPU instructions are supported, generating illegal instructions and therefore binaries that fault at runtime.
Detailed Description
The compiler infrastructure appears to use static CPU model mappings to determine available instruction sets, instead of querying what the CPU and kernel actually support (via /proc/cpuinfo or HWCAP on AArch64). This causes particular problems on:
- Armv9 CPUs in Qualcomm SoCs that do not implement SVE, despite the Armv9 specification requiring it
- Potentially any system whose Linux kernel is not configured with CONFIG_ARM64_SVE=y

The problem is not necessarily limited to AArch64 or the cases above. It affects instruction selection, code generation, and runtime dispatching across all of LLVM, including Clang, Flang, OpenMP, and ORC JIT, as the sketch below illustrates for host detection.
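A minimal diagnostic sketch of the mismatch (assumptions: Linux on AArch64, LLVM 19 or newer; LLVM 18's sys::getHostCPUFeatures instead fills a StringMap out-parameter and returns bool):

```cpp
// Compare the kernel's HWCAP view of SVE with LLVM's host detection.
#include <sys/auxv.h>   // getauxval
#include <asm/hwcap.h>  // HWCAP_SVE (AArch64 only)
#include <cstdio>
#include "llvm/ADT/StringMap.h"
#include "llvm/TargetParser/Host.h"

int main() {
  // SVE is usable only when the hardware implements it AND the kernel
  // enables it (CONFIG_ARM64_SVE=y); HWCAP reflects both conditions.
  bool KernelSVE = getauxval(AT_HWCAP) & HWCAP_SVE;

  // LLVM's view of the host.
  llvm::StringMap<bool> Features = llvm::sys::getHostCPUFeatures();
  bool LLVMSVE = Features.lookup("sve");

  std::printf("host CPU per LLVM: %s\n",
              llvm::sys::getHostCPUName().str().c_str());
  std::printf("SVE per HWCAP: %d, SVE per LLVM: %d\n", KernelSVE, LLVMSVE);
  // Codegen that assumes SVE while HWCAP says otherwise produces the
  // illegal-instruction faults described in this report.
  return 0;
}
```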
Reproduction Steps
Below is a relatively minimal test case using ORC JIT that demonstrates the issue. A Termux environment on an Android device with a Qualcomm chip is likely the easiest target for reproduction. The issue can also be reproduced by compiling a vectorizable loop with Clang using the -march=native flag; a sketch of that variant comes first, followed by the full ORC JIT test case.
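A hypothetical sketch of the Clang-only variant (the file name, flags, and kernel are illustrative assumptions):

```cpp
// clang-repro.cpp -- build with: clang++ -O3 -march=native clang-repro.cpp
// On an affected device, host detection enables SVE and the vectorized
// loop below may be lowered to SVE instructions that fault at runtime.
#include <cstdio>
#include <vector>

void vector_op(const float *a, const float *b, float *r, int n) {
  for (int i = 0; i < n; i++)
    r[i] = a[i] * a[i] + b[i]; // same a*a + b kernel as the JIT test below
}

int main() {
  const int N = 1024;
  std::vector<float> A(N), B(N), R(N);
  for (int i = 0; i < N; i++) {
    A[i] = static_cast<float>(i);
    B[i] = 2.0f * i;
  }
  vector_op(A.data(), B.data(), R.data(), N);
  std::printf("R[10] = %g\n", R[10]); // expect 120 when it does not fault
  return 0;
}
```

The full ORC JIT test case: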
```cpp
#include <iostream>
#include <vector>
#include <string>
#include "llvm/ExecutionEngine/Orc/LLJIT.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/TargetParser/Host.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
using namespace llvm::orc;
// Diagnostic handler to suppress remarks
class SilenceRemarksHandler : public DiagnosticHandler {
public:
bool handleDiagnostics(const DiagnosticInfo &DI) override {
// Ignore remarks, pass through other diagnostics
if (DI.getSeverity() == DS_Remark) {
return true;
}
return false;
}
};
std::unique_ptr<Module> createVectorModule(LLVMContext &Context) {
auto M = std::make_unique<Module>("VecTest", Context);
auto *FloatTy = Type::getFloatTy(Context);
auto *FloatPtrTy = PointerType::get(FloatTy, 0);
auto *Int32Ty = Type::getInt32Ty(Context);
FunctionType *FT = FunctionType::get(
Type::getVoidTy(Context),
{FloatPtrTy, FloatPtrTy, FloatPtrTy, Int32Ty},
false);
Function *F = Function::Create(FT, Function::ExternalLinkage, "vector_op", M.get());
F->addFnAttr(Attribute::NoUnwind);
auto Args = F->arg_begin();
Value *A = &*Args++;
Value *B = &*Args++;
Value *Result = &*Args++;
Value *Length = &*Args++;
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", F);
BasicBlock *LoopBB = BasicBlock::Create(Context, "loop", F);
BasicBlock *ExitBB = BasicBlock::Create(Context, "exit", F);
IRBuilder<> Builder(Context);
Builder.SetInsertPoint(EntryBB);
Value *IndexAlloca = Builder.CreateAlloca(Int32Ty, nullptr, "i");
Builder.CreateStore(ConstantInt::get(Int32Ty, 0), IndexAlloca);
Builder.CreateBr(LoopBB);
Builder.SetInsertPoint(LoopBB);
Value *Index = Builder.CreateLoad(Int32Ty, IndexAlloca, "idx");
Value *APtr = Builder.CreateGEP(FloatTy, A, Index, "a_ptr");
Value *BPtr = Builder.CreateGEP(FloatTy, B, Index, "b_ptr");
Value *ResultPtr = Builder.CreateGEP(FloatTy, Result, Index, "result_ptr");
MDNode *AccessGroup = MDNode::get(Context, {});
Value *AVal = Builder.CreateLoad(FloatTy, APtr, "a_val");
Value *BVal = Builder.CreateLoad(FloatTy, BPtr, "b_val");
cast<Instruction>(AVal)->setMetadata("llvm.mem.parallel_loop_access", AccessGroup);
cast<Instruction>(BVal)->setMetadata("llvm.mem.parallel_loop_access", AccessGroup);
Value *Square = Builder.CreateFMul(AVal, AVal, "square");
Value *AddResult = Builder.CreateFAdd(Square, BVal, "add");
auto *Store = Builder.CreateStore(AddResult, ResultPtr);
Store->setMetadata("llvm.mem.parallel_loop_access", AccessGroup);
Value *NextIndex = Builder.CreateAdd(Index, ConstantInt::get(Int32Ty, 1), "next_idx");
Builder.CreateStore(NextIndex, IndexAlloca);
// Compare against the incremented index so the body never runs with
// idx == Length; comparing the pre-increment index at the top of the
// block executed one extra iteration, accessing one element out of bounds.
Value *LoopCond = Builder.CreateICmpSLT(NextIndex, Length, "cond");
// Loop metadata to force vectorization
MDNode *ForcedVec = MDNode::get(Context, {
MDString::get(Context, "llvm.loop.vectorize.enable"),
ConstantAsMetadata::get(ConstantInt::get(Type::getInt1Ty(Context), 1))
});
MDNode *LoopID = MDNode::get(Context, {MDNode::get(Context, {}), ForcedVec});
LoopID->replaceOperandWith(0, LoopID);
Builder.CreateCondBr(LoopCond, LoopBB, ExitBB)->setMetadata("llvm.loop", LoopID);
Builder.SetInsertPoint(ExitBB);
Builder.CreateRetVoid();
verifyFunction(*F);
return M;
}
// Apply optimization passes to force vectorization
void optimizeModule(Module &M, TargetMachine *TM) {
PassBuilder PB;
LoopAnalysisManager LAM;
FunctionAnalysisManager FAM;
CGSCCAnalysisManager CGAM;
ModuleAnalysisManager MAM;
FAM.registerPass([&] { return TM->getTargetIRAnalysis(); });
PB.registerModuleAnalyses(MAM);
PB.registerCGSCCAnalyses(CGAM);
PB.registerFunctionAnalyses(FAM);
PB.registerLoopAnalyses(LAM);
PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
ModulePassManager MPM = PB.buildPerModuleDefaultPipeline(OptimizationLevel::O3);
MPM.run(M, MAM);
}
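// Note: constructing the PassBuilder with the target machine
// (PassBuilder PB(TM);) would register target-specific analyses
// automatically; the manual TargetIRAnalysis registration above achieves
// the same thing for this test.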
int main(int argc, char** argv) {
// Parse command line arguments
bool useNoSVE = false;
for (int i = 1; i < argc; i++) {
if (std::string(argv[i]) == "--use-nosve") {
useNoSVE = true;
}
}
InitializeNativeTarget();
InitializeNativeTargetAsmPrinter();
InitializeNativeTargetAsmParser();
// Silence remarks
LLVMContext Context;
Context.setDiagnosticHandler(std::make_unique<SilenceRemarksHandler>());
auto JTMB = cantFail(JITTargetMachineBuilder::detectHost());
JTMB.setCodeGenOptLevel(CodeGenOptLevel::Aggressive);
if (useNoSVE) {
JTMB.addFeatures(std::vector<std::string>{"-sve"});
}
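// "-sve" uses LLVM's backend feature-string syntax: a leading '-'
// disables a feature and a leading '+' enables one.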
std::unique_ptr<TargetMachine> TM(cantFail(JTMB.createTargetMachine()));
auto M = createVectorModule(Context);
M->setDataLayout(TM->createDataLayout());
// Run the O3 pipeline so the forced-vectorization metadata takes effect
optimizeModule(*M, TM.get());
// Set up the JIT and look up the compiled function
auto JIT = cantFail(LLJITBuilder().setJITTargetMachineBuilder(std::move(JTMB)).create());
cantFail(JIT->addIRModule(ThreadSafeModule(std::move(M), std::make_unique<LLVMContext>())));
auto VecOpAddr = cantFail(JIT->lookup("vector_op"));
auto *VectorOp = VecOpAddr.toPtr<void (*)(float *, float *, float *, int)>();
const int Length = 1024;
std::vector<float> A(Length), B(Length), Result(Length);
for (int i = 0; i < Length; i++) {
A[i] = i;
B[i] = i * 2;
}
// Execute the JIT-compiled function; on affected devices it should
// fault with an illegal instruction (SIGILL)
VectorOp(A.data(), B.data(), Result.data(), Length);
// Will only reach here if execution succeeds
std::cout << "Result[10]: " << Result[10] << std::endl;
return 0;
}
```
When executed normally, the program generates illegal instructions on hardware meeting the conditions above. It also accepts a --use-nosve argument, which adds -sve to the JIT's feature list and should prevent the crash.
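For reference: with --use-nosve (or on an unaffected configuration) the expected output is Result[10]: 120, since A[10] = 10 and B[10] = 20, so 10 * 10 + 20 = 120. The default run on an affected device should instead die with SIGILL inside the JIT-compiled vector_op.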
Additional Context
Attempting to work around this issue locally revealed frustrating inconsistencies in how CPU features are specified across the different LLVM interfaces:
- -march=
- -mcpu=
- -Xclang -target-feature
- llvm::orc::JITTargetMachineBuilder::addFeatures()

Each of these accepts a different set of feature flags, with inconsistent naming conventions and limited documentation.
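For example, expressing the single intent "disable SVE" appears to require a different spelling at each layer (the first spelling is an assumption based on Clang's documented -march extension syntax): -march=armv8.2-a+nosve for the driver, -Xclang -target-feature -Xclang -sve to pass the backend feature string through the driver, and JTMB.addFeatures({"-sve"}) (the same backend string) in ORC, while -mcpu= selects a CPU model whose feature set is implied rather than stated.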