diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h index 56eb463fc98fc..79a91861554df 100644 --- a/bolt/include/bolt/Profile/DataAggregator.h +++ b/bolt/include/bolt/Profile/DataAggregator.h @@ -223,7 +223,8 @@ class DataAggregator : public DataReader { bool recordExit(BinaryFunction &BF, uint64_t From, bool Mispred, uint64_t Count = 1) const; - /// Aggregation statistics + /// Branch stacks aggregation statistics + uint64_t NumTraces{0}; uint64_t NumInvalidTraces{0}; uint64_t NumLongRangeTraces{0}; /// Specifies how many samples were recorded in cold areas if we are dealing @@ -231,6 +232,7 @@ class DataAggregator : public DataReader { /// for the source of the branch to avoid counting cold activity twice (one /// for source and another for destination). uint64_t NumColdSamples{0}; + uint64_t NumTotalSamples{0}; /// Looks into system PATH for Linux Perf and set up the aggregator to use it void findPerfExecutable(); @@ -327,8 +329,8 @@ class DataAggregator : public DataReader { /// Parse a single LBR entry as output by perf script -Fbrstack ErrorOr parseLBREntry(); - /// Parse LBR sample, returns the number of traces. - uint64_t parseLBRSample(const PerfBranchSample &Sample, bool NeedsSkylakeFix); + /// Parse LBR sample. + void parseLBRSample(const PerfBranchSample &Sample, bool NeedsSkylakeFix); /// Parse and pre-aggregate branch events. std::error_code parseBranchEvents(); @@ -487,6 +489,13 @@ class DataAggregator : public DataReader { void dump(const PerfBranchSample &Sample) const; void dump(const PerfMemSample &Sample) const; + /// Profile diagnostics print methods + void printColdSamplesDiagnostic() const; + void printLongRangeTracesDiagnostic() const; + void printBranchSamplesDiagnostics() const; + void printBasicSamplesDiagnostics(uint64_t OutOfRangeSamples) const; + void printBranchStacksDiagnostics(uint64_t IgnoredSamples) const; + public: /// If perf.data was collected without build ids, the buildid-list may contain /// incomplete entries. Return true if the buffer containing diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index f016576ff61af..a8a187974418d 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -634,7 +634,7 @@ bool DataAggregator::doSample(BinaryFunction &OrigFunc, uint64_t Address, uint64_t Count) { BinaryFunction *ParentFunc = getBATParentFunction(OrigFunc); BinaryFunction &Func = ParentFunc ? *ParentFunc : OrigFunc; - if (ParentFunc) + if (ParentFunc || (BAT && !BAT->isBATFunction(OrigFunc.getAddress()))) NumColdSamples += Count; auto I = NamesToSamples.find(Func.getOneName()); @@ -756,12 +756,13 @@ bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count, Addr = BAT->translate(Func->getAddress(), Addr, IsFrom); BinaryFunction *ParentFunc = getBATParentFunction(*Func); + if (IsFrom && + (ParentFunc || (BAT && !BAT->isBATFunction(Func->getAddress())))) + NumColdSamples += Count; + if (!ParentFunc) return std::pair{Func, IsRetOrCallCont}; - if (IsFrom) - NumColdSamples += Count; - return std::pair{ParentFunc, IsRetOrCallCont}; }; @@ -1422,9 +1423,8 @@ std::error_code DataAggregator::printLBRHeatMap() { return std::error_code(); } -uint64_t DataAggregator::parseLBRSample(const PerfBranchSample &Sample, - bool NeedsSkylakeFix) { - uint64_t NumTraces{0}; +void DataAggregator::parseLBRSample(const PerfBranchSample &Sample, + bool NeedsSkylakeFix) { // LBRs are stored in reverse execution order. NextLBR refers to the next // executed branch record. const LBREntry *NextLBR = nullptr; @@ -1487,7 +1487,83 @@ uint64_t DataAggregator::parseLBRSample(const PerfBranchSample &Sample, ++Info.TakenCount; Info.MispredCount += LBR.Mispred; } - return NumTraces; +} + +void DataAggregator::printColdSamplesDiagnostic() const { + if (NumColdSamples > 0) { + const float ColdSamples = NumColdSamples * 100.0f / NumTotalSamples; + outs() << "PERF2BOLT: " << NumColdSamples + << format(" (%.1f%%)", ColdSamples) + << " samples recorded in cold regions of split functions.\n"; + if (ColdSamples > 5.0f) + outs() + << "WARNING: The BOLT-processed binary where samples were collected " + "likely used bad data or your service observed a large shift in " + "profile. You may want to audit this\n"; + } +} + +void DataAggregator::printLongRangeTracesDiagnostic() const { + outs() << "PERF2BOLT: out of range traces involving unknown regions: " + << NumLongRangeTraces; + if (NumTraces > 0) + outs() << format(" (%.1f%%)", NumLongRangeTraces * 100.0f / NumTraces); + outs() << "\n"; +} + +static float printColoredPct(uint64_t Numerator, uint64_t Denominator, float T1, + float T2) { + if (Denominator == 0) { + outs() << "\n"; + return 0; + } + float Percent = Numerator * 100.0f / Denominator; + outs() << " ("; + if (outs().has_colors()) { + if (Percent > T2) + outs().changeColor(raw_ostream::RED); + else if (Percent > T1) + outs().changeColor(raw_ostream::YELLOW); + else + outs().changeColor(raw_ostream::GREEN); + } + outs() << format("%.1f%%", Percent); + if (outs().has_colors()) + outs().resetColor(); + outs() << ")\n"; + return Percent; +} + +void DataAggregator::printBranchSamplesDiagnostics() const { + outs() << "PERF2BOLT: traces mismatching disassembled function contents: " + << NumInvalidTraces; + if (printColoredPct(NumInvalidTraces, NumTraces, 5, 10) > 10) + outs() << "\n !! WARNING !! This high mismatch ratio indicates the input " + "binary is probably not the same binary used during profiling " + "collection. The generated data may be ineffective for improving " + "performance\n\n"; + printLongRangeTracesDiagnostic(); + printColdSamplesDiagnostic(); +} + +void DataAggregator::printBasicSamplesDiagnostics( + uint64_t OutOfRangeSamples) const { + outs() << "PERF2BOLT: out of range samples recorded in unknown regions: " + << OutOfRangeSamples; + if (printColoredPct(OutOfRangeSamples, NumTotalSamples, 40, 60) > 80) + outs() << "\n !! WARNING !! This high mismatch ratio indicates the input " + "binary is probably not the same binary used during profiling " + "collection. The generated data may be ineffective for improving " + "performance\n\n"; + printColdSamplesDiagnostic(); +} + +void DataAggregator::printBranchStacksDiagnostics( + uint64_t IgnoredSamples) const { + outs() << "PERF2BOLT: ignored samples: " << IgnoredSamples; + if (printColoredPct(IgnoredSamples, NumTotalSamples, 20, 50) > 50) + errs() << "PERF2BOLT-WARNING: less than 50% of all recorded samples " + "were attributed to the input binary\n"; } std::error_code DataAggregator::parseBranchEvents() { @@ -1495,11 +1571,9 @@ std::error_code DataAggregator::parseBranchEvents() { NamedRegionTimer T("parseBranch", "Parsing branch events", TimerGroupName, TimerGroupDesc, opts::TimeAggregator); - uint64_t NumTotalSamples = 0; uint64_t NumEntries = 0; uint64_t NumSamples = 0; uint64_t NumSamplesNoLBR = 0; - uint64_t NumTraces = 0; bool NeedsSkylakeFix = false; while (hasData() && NumTotalSamples < opts::MaxSamples) { @@ -1526,7 +1600,7 @@ std::error_code DataAggregator::parseBranchEvents() { NeedsSkylakeFix = true; } - NumTraces += parseLBRSample(Sample, NeedsSkylakeFix); + parseLBRSample(Sample, NeedsSkylakeFix); } for (const Trace &Trace : llvm::make_first_range(BranchLBRs)) @@ -1534,22 +1608,6 @@ std::error_code DataAggregator::parseBranchEvents() { if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Addr)) BF->setHasProfileAvailable(); - auto printColored = [](raw_ostream &OS, float Percent, float T1, float T2) { - OS << " ("; - if (OS.has_colors()) { - if (Percent > T2) - OS.changeColor(raw_ostream::RED); - else if (Percent > T1) - OS.changeColor(raw_ostream::YELLOW); - else - OS.changeColor(raw_ostream::GREEN); - } - OS << format("%.1f%%", Percent); - if (OS.has_colors()) - OS.resetColor(); - OS << ")"; - }; - outs() << "PERF2BOLT: read " << NumSamples << " samples and " << NumEntries << " LBR entries\n"; if (NumTotalSamples) { @@ -1561,47 +1619,10 @@ std::error_code DataAggregator::parseBranchEvents() { "in no-LBR mode with -nl (the performance improvement in -nl " "mode may be limited)\n"; } else { - const uint64_t IgnoredSamples = NumTotalSamples - NumSamples; - const float PercentIgnored = 100.0f * IgnoredSamples / NumTotalSamples; - outs() << "PERF2BOLT: " << IgnoredSamples << " samples"; - printColored(outs(), PercentIgnored, 20, 50); - outs() << " were ignored\n"; - if (PercentIgnored > 50.0f) - errs() << "PERF2BOLT-WARNING: less than 50% of all recorded samples " - "were attributed to the input binary\n"; + printBranchStacksDiagnostics(NumTotalSamples - NumSamples); } } - outs() << "PERF2BOLT: traces mismatching disassembled function contents: " - << NumInvalidTraces; - float Perc = 0.0f; - if (NumTraces > 0) { - Perc = NumInvalidTraces * 100.0f / NumTraces; - printColored(outs(), Perc, 5, 10); - } - outs() << "\n"; - if (Perc > 10.0f) - outs() << "\n !! WARNING !! This high mismatch ratio indicates the input " - "binary is probably not the same binary used during profiling " - "collection. The generated data may be ineffective for improving " - "performance.\n\n"; - - outs() << "PERF2BOLT: out of range traces involving unknown regions: " - << NumLongRangeTraces; - if (NumTraces > 0) - outs() << format(" (%.1f%%)", NumLongRangeTraces * 100.0f / NumTraces); - outs() << "\n"; - - if (NumColdSamples > 0) { - const float ColdSamples = NumColdSamples * 100.0f / NumTotalSamples; - outs() << "PERF2BOLT: " << NumColdSamples - << format(" (%.1f%%)", ColdSamples) - << " samples recorded in cold regions of split functions.\n"; - if (ColdSamples > 5.0f) - outs() - << "WARNING: The BOLT-processed binary where samples were collected " - "likely used bad data or your service observed a large shift in " - "profile. You may want to audit this.\n"; - } + printBranchSamplesDiagnostics(); return std::error_code(); } @@ -1658,11 +1679,10 @@ void DataAggregator::processBasicEvents() { NamedRegionTimer T("processBasic", "Processing basic events", TimerGroupName, TimerGroupDesc, opts::TimeAggregator); uint64_t OutOfRangeSamples = 0; - uint64_t NumSamples = 0; for (auto &Sample : BasicSamples) { const uint64_t PC = Sample.first; const uint64_t HitCount = Sample.second; - NumSamples += HitCount; + NumTotalSamples += HitCount; BinaryFunction *Func = getBinaryFunctionContainingAddress(PC); if (!Func) { OutOfRangeSamples += HitCount; @@ -1671,33 +1691,9 @@ void DataAggregator::processBasicEvents() { doSample(*Func, PC, HitCount); } - outs() << "PERF2BOLT: read " << NumSamples << " samples\n"; + outs() << "PERF2BOLT: read " << NumTotalSamples << " samples\n"; - outs() << "PERF2BOLT: out of range samples recorded in unknown regions: " - << OutOfRangeSamples; - float Perc = 0.0f; - if (NumSamples > 0) { - outs() << " ("; - Perc = OutOfRangeSamples * 100.0f / NumSamples; - if (outs().has_colors()) { - if (Perc > 60.0f) - outs().changeColor(raw_ostream::RED); - else if (Perc > 40.0f) - outs().changeColor(raw_ostream::YELLOW); - else - outs().changeColor(raw_ostream::GREEN); - } - outs() << format("%.1f%%", Perc); - if (outs().has_colors()) - outs().resetColor(); - outs() << ")"; - } - outs() << "\n"; - if (Perc > 80.0f) - outs() << "\n !! WARNING !! This high mismatch ratio indicates the input " - "binary is probably not the same binary used during profiling " - "collection. The generated data may be ineffective for improving " - "performance.\n\n"; + printBasicSamplesDiagnostics(OutOfRangeSamples); } std::error_code DataAggregator::parseMemEvents() { @@ -1775,13 +1771,13 @@ void DataAggregator::processPreAggregated() { NamedRegionTimer T("processAggregated", "Processing aggregated branch events", TimerGroupName, TimerGroupDesc, opts::TimeAggregator); - uint64_t NumTraces = 0; for (const AggregatedLBREntry &AggrEntry : AggregatedLBRs) { switch (AggrEntry.EntryType) { case AggregatedLBREntry::BRANCH: case AggregatedLBREntry::TRACE: doBranch(AggrEntry.From.Offset, AggrEntry.To.Offset, AggrEntry.Count, AggrEntry.Mispreds); + NumTotalSamples += AggrEntry.Count; break; case AggregatedLBREntry::FT: case AggregatedLBREntry::FT_EXTERNAL_ORIGIN: { @@ -1799,37 +1795,7 @@ void DataAggregator::processPreAggregated() { outs() << "PERF2BOLT: read " << AggregatedLBRs.size() << " aggregated LBR entries\n"; - outs() << "PERF2BOLT: traces mismatching disassembled function contents: " - << NumInvalidTraces; - float Perc = 0.0f; - if (NumTraces > 0) { - outs() << " ("; - Perc = NumInvalidTraces * 100.0f / NumTraces; - if (outs().has_colors()) { - if (Perc > 10.0f) - outs().changeColor(raw_ostream::RED); - else if (Perc > 5.0f) - outs().changeColor(raw_ostream::YELLOW); - else - outs().changeColor(raw_ostream::GREEN); - } - outs() << format("%.1f%%", Perc); - if (outs().has_colors()) - outs().resetColor(); - outs() << ")"; - } - outs() << "\n"; - if (Perc > 10.0f) - outs() << "\n !! WARNING !! This high mismatch ratio indicates the input " - "binary is probably not the same binary used during profiling " - "collection. The generated data may be ineffective for improving " - "performance.\n\n"; - - outs() << "PERF2BOLT: Out of range traces involving unknown regions: " - << NumLongRangeTraces; - if (NumTraces > 0) - outs() << format(" (%.1f%%)", NumLongRangeTraces * 100.0f / NumTraces); - outs() << "\n"; + printBranchSamplesDiagnostics(); } std::optional DataAggregator::parseCommExecEvent() {