-
Notifications
You must be signed in to change notification settings - Fork 13.7k
[BOLT] Support pre-aggregated returns #143296
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: users/aaupov/spr/main.bolt-support-pre-aggregated-returns
Are you sure you want to change the base?
[BOLT] Support pre-aggregated returns #143296
Conversation
Created using spr 1.3.4
Created using spr 1.3.4
@llvm/pr-subscribers-bolt
Author: Amir Ayupov (aaupov)
Changes: Intel's Architectural LBR supports capturing branch type information as part of LBR stack (SDM Vol 3B, part 2, October 2024).
The Linux kernel can preserve branch type when `save_type` is enabled, even if the CPU does not support Architectural LBR:
> - save_type: save branch type during sampling in case binary is not available later.
This information is needed to disambiguate external returns (from DSO/JIT) to an entry point or a landing pad, when BOLT can't disassemble the branch source.
This patch adds new pre-aggregated trace type (R).
Test Plan: updated callcont-fallthru.s
Full diff: https://github.com/llvm/llvm-project/pull/143296.diff
4 Files Affected:
diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index 96969cf53baca..ae66c58e127cd 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -109,6 +109,7 @@ class DataAggregator : public DataReader {
static constexpr const uint64_t BR_ONLY = -1ULL;
static constexpr const uint64_t FT_ONLY = -1ULL;
static constexpr const uint64_t FT_EXTERNAL_ORIGIN = -2ULL;
+ static constexpr const uint64_t BR_EXTERNAL_RETURN = -3ULL;
uint64_t Branch;
uint64_t From;
@@ -388,7 +389,7 @@ class DataAggregator : public DataReader {
/// File format syntax:
/// E <event>
/// S <start> <count>
- /// T <start> <end> <ft_end> <count>
+ /// [TR] <start> <end> <ft_end> <count>
/// B <start> <end> <count> <mispred_count>
/// [Ff] <start> <end> <count>
///
@@ -403,6 +404,7 @@ class DataAggregator : public DataReader {
/// jump to the block
/// T - an aggregated trace: branch from <start> to <end> with a fall-through
/// to <ft_end>
+ /// R - an aggregated trace originating at a return
///
/// <id> - build id of the object containing the address. We can skip it for
/// the main binary and use "X" for an unknown object. This will save some
@@ -532,6 +534,9 @@ inline raw_ostream &operator<<(raw_ostream &OS,
case DataAggregator::Trace::FT_ONLY:
case DataAggregator::Trace::FT_EXTERNAL_ORIGIN:
break;
+ case DataAggregator::Trace::BR_EXTERNAL_RETURN:
+ OS << "0 -> ";
+ break;
default:
OS << Twine::utohexstr(T.Branch) << " -> ";
}
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index 11d282e98413b..c28dd6e57f8e4 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -1194,6 +1194,7 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
INVALID = 0,
EVENT_NAME, // E
TRACE, // T
+ RETURN, // R
SAMPLE, // S
BRANCH, // B
FT, // F
@@ -1224,6 +1225,7 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
Type = StringSwitch<AggregatedLBREntry>(Str)
.Case("T", TRACE)
+ .Case("R", RETURN)
.Case("S", SAMPLE)
.Case("E", EVENT_NAME)
.Case("B", BRANCH)
@@ -1237,7 +1239,7 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
}
using SSI = StringSwitch<int>;
- AddrNum = SSI(Str).Case("T", 3).Case("S", 1).Case("E", 0).Default(2);
+ AddrNum = SSI(Str).Cases("T", "R", 3).Case("S", 1).Case("E", 0).Default(2);
CounterNum = SSI(Str).Case("B", 2).Case("E", 0).Default(1);
}
@@ -1295,8 +1297,13 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
Addr[0] = Location(Type == FT ? Trace::FT_ONLY : Trace::FT_EXTERNAL_ORIGIN);
}
- if (Type == BRANCH) {
+ if (Type == BRANCH)
Addr[2] = Location(Trace::BR_ONLY);
+
+ if (Type == RETURN) {
+ if (!Addr[0]->Offset)
+ Addr[0]->Offset = Trace::BR_EXTERNAL_RETURN;
+ Returns.emplace(Addr[0]->Offset);
}
Trace T{Addr[0]->Offset, Addr[1]->Offset, Addr[2]->Offset};
diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s
index c2ef024db9475..63142903c80d2 100644
--- a/bolt/test/X86/callcont-fallthru.s
+++ b/bolt/test/X86/callcont-fallthru.s
@@ -10,6 +10,8 @@
# RUN: link_fdata %s %t %t.pa-ret PREAGG-RET
# Trace from an external location to a landing pad/entry point call continuation
# RUN: link_fdata %s %t %t.pa-ext PREAGG-EXT
+# Return trace to a landing pad/entry point call continuation
+# RUN: link_fdata %s %t %t.pa-pret PREAGG-PRET
# RUN-DISABLED: link_fdata %s %t %t.pa-plt PREAGG-PLT
# RUN: llvm-strip --strip-unneeded %t -o %t.strip
@@ -38,6 +40,15 @@
# RUN: llvm-bolt %t.strip --pa -p %t.pa-ext -o %t.out \
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-SKIP
+## Check pre-aggregated return traces from external location attach call
+## continuation fallthrough count to secondary entry point (unstripped)
+# RUN: llvm-bolt %t --pa -p %t.pa-pret -o %t.out \
+# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH
+## Check pre-aggregated return traces from external location attach call
+## continuation fallthrough count to landing pad (stripped, landing pad)
+# RUN: llvm-bolt %t.strip --pa -p %t.pa-pret -o %t.out \
+# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH
+
## Check pre-aggregated traces don't report zero-sized PLT fall-through as
## invalid trace
# RUN-DISABLED: llvm-bolt %t.strip --pa -p %t.pa-plt -o %t.out | FileCheck %s \
@@ -92,6 +103,8 @@ Ltmp4_br:
# PREAGG-RET: T #Lfoo_ret# #Ltmp3# #Ltmp3_br# 1
## Target is a secondary entry point (unstripped) or a landing pad (stripped)
# PREAGG-EXT: T X:0 #Ltmp3# #Ltmp3_br# 1
+## Pre-aggregated return trace
+# PREAGG-PRET: R X:0 #Ltmp3# #Ltmp3_br# 1
# CHECK-ATTACH: callq foo
# CHECK-ATTACH-NEXT: count: 1
diff --git a/bolt/test/link_fdata.py b/bolt/test/link_fdata.py
index 5a9752068bb9f..cb6b3c7baaab5 100755
--- a/bolt/test/link_fdata.py
+++ b/bolt/test/link_fdata.py
@@ -36,9 +36,9 @@
fdata_pat = re.compile(r"([01].*) (?P<mispred>\d+) (?P<exec>\d+)")
# Pre-aggregated profile:
-# {T|S|E|B|F|f} <start> [<end>] [<ft_end>] <count> [<mispred_count>]
+# {T|R|S|E|B|F|f} <start> [<end>] [<ft_end>] <count> [<mispred_count>]
# <loc>: [<id>:]<offset>
-preagg_pat = re.compile(r"(?P<type>[TSBFf]) (?P<offsets_count>.*)")
+preagg_pat = re.compile(r"(?P<type>[TRSBFf]) (?P<offsets_count>.*)")
# No-LBR profile:
# <is symbol?> <closest elf symbol or DSO name> <relative address> <count>
|
Intel's Architectural LBR supports capturing branch type information
as part of LBR stack (SDM Vol 3B, part 2, October 2024):
The Linux kernel can preserve branch type when `save_type` is enabled, even if the CPU does not support Architectural LBR:
https://github.com/torvalds/linux/blob/f09079bd04a924c72d555cd97942d5f8d7eca98c/tools/perf/Documentation/perf-record.txt#L457-L460
This information is needed to disambiguate external returns (from
DSO/JIT) to an entry point or a landing pad, when BOLT can't
disassemble the branch source.
This patch adds new pre-aggregated trace type (R).
Depends on #143295.
Test Plan: updated callcont-fallthru.s