Skip to content

Commit e714142

Browse files
committed
Change heartbeat thread controls
When enabling heartbeats, the user must specify:
- heartbeat_s: jl_heartbeat() must be called at least once every heartbeat_s seconds; if it isn't, a one-line heartbeat loss report is printed.
- show_tasks_after_n: after this many heartbeat_s intervals have passed without jl_heartbeat() being called, print task backtraces and stop all reporting.
- reset_after_n: after this many heartbeat_s intervals have passed with jl_heartbeat() being called, print a "heartbeats recovered" message and reset reporting.
1 parent fdc8377 commit e714142

File tree

1 file changed

+46
-49
lines changed

1 file changed

+46
-49
lines changed

src/threading.c

+46-49
Original file line numberDiff line numberDiff line change
@@ -942,9 +942,9 @@ volatile int heartbeat_enabled;
942942
uv_sem_t heartbeat_on_sem, // jl_heartbeat_enable -> thread
943943
heartbeat_off_sem; // thread -> jl_heartbeat_enable
944944
int heartbeat_interval_s,
945-
n_loss_reports,
946-
reset_reporting_s;
947-
int last_report_s, report_interval_s, n_reported;
945+
tasks_after_n,
946+
reset_tasks_after_n;
947+
int tasks_showed, n_hbs_missed, n_hbs_recvd;
948948
_Atomic(int) heartbeats;
949949

950950
JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT;
@@ -963,21 +963,19 @@ void jl_init_heartbeat(void)
963963

964964
// enable/disable heartbeats
965965
// heartbeat_s: interval within which jl_heartbeat() must be called
966-
// n_reports: for one heartbeat loss interval, how many times to report
967-
// reset_reporting_after_s: how long to wait after a heartbeat loss
968-
// interval and a return to steady heartbeats, before resetting
969-
// reporting behavior
966+
// show_tasks_after_n: number of heartbeats missed before printing task backtraces
967+
// reset_after_n: number of heartbeats after which to reset
970968
//
971969
// When disabling heartbeats, the heartbeat thread must wake up,
972970
// find out that heartbeats are now disabled, and reset. For now, we
973971
// handle this by preventing re-enabling of heartbeats until this
974972
// completes.
975-
JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int n_reports,
976-
int reset_reporting_after_s)
973+
JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int show_tasks_after_n,
974+
int reset_after_n)
977975
{
978976
if (heartbeat_s <= 0) {
979977
heartbeat_enabled = 0;
980-
heartbeat_interval_s = n_loss_reports = reset_reporting_s = 0;
978+
heartbeat_interval_s = tasks_after_n = reset_tasks_after_n = 0;
981979
}
982980
else {
983981
// must disable before enabling
@@ -991,10 +989,11 @@ JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int n_reports,
991989

992990
jl_atomic_store_relaxed(&heartbeats, 0);
993991
heartbeat_interval_s = heartbeat_s;
994-
n_loss_reports = n_reports;
995-
reset_reporting_s = reset_reporting_after_s;
996-
last_report_s = 0;
997-
report_interval_s = heartbeat_interval_s;
992+
tasks_after_n = show_tasks_after_n;
993+
reset_tasks_after_n = reset_after_n;
994+
tasks_showed = 0;
995+
n_hbs_missed = 0;
996+
n_hbs_recvd = 0;
998997
heartbeat_enabled = 1;
999998
uv_sem_post(&heartbeat_on_sem); // wake the heartbeat thread
1000999
}
@@ -1030,44 +1029,42 @@ void sleep_for(int secs, int nsecs)
10301029
uint8_t check_heartbeats(uint8_t gc_state)
10311030
{
10321031
int hb = jl_atomic_exchange(&heartbeats, 0);
1033-
uint64_t curr_s = jl_hrtime() / 1e9;
10341032

10351033
if (hb <= 0) {
1036-
// we didn't get a heartbeat in the last interval; should we report?
1037-
if (n_reported < n_loss_reports &&
1038-
curr_s - last_report_s >= report_interval_s) {
1039-
jl_task_t *ct = jl_current_task;
1040-
jl_ptls_t ptls = ct->ptls;
1041-
1042-
// exit GC-safe region to report then re-enter
1043-
jl_gc_safe_leave(ptls, gc_state);
1044-
jl_safe_printf("==== heartbeat loss ====\n");
1045-
jl_print_task_backtraces(0);
1046-
gc_state = jl_gc_safe_enter(ptls);
1047-
1048-
// we've reported
1049-
n_reported++;
1050-
1051-
// record the reporting time _after_ the report
1052-
last_report_s = jl_hrtime() / 1e9;
1053-
1054-
// double the reporting interval up to a maximum
1055-
if (report_interval_s < 60 * heartbeat_interval_s) {
1056-
report_interval_s *= 2;
1034+
// we didn't get a heartbeat
1035+
n_hbs_recvd = 0;
1036+
n_hbs_missed++;
1037+
1038+
// if we've printed task backtraces already, do nothing
1039+
if (!tasks_showed) {
1040+
// otherwise, at least show this message
1041+
jl_safe_printf("==== heartbeat loss (%ds) ====\n",
1042+
n_hbs_missed * heartbeat_interval_s);
1043+
// if we've missed enough heartbeats, print task backtraces
1044+
if (n_hbs_missed >= tasks_after_n) {
1045+
jl_task_t *ct = jl_current_task;
1046+
jl_ptls_t ptls = ct->ptls;
1047+
1048+
// exit GC-safe region to report then re-enter
1049+
jl_gc_safe_leave(ptls, gc_state);
1050+
jl_print_task_backtraces(0);
1051+
gc_state = jl_gc_safe_enter(ptls);
1052+
1053+
// we printed task backtraces
1054+
tasks_showed = 1;
10571055
}
10581056
}
1059-
// no heartbeats, don't change reporting state
1060-
return gc_state;
10611057
}
10621058
else {
1063-
// we got a heartbeat; reset the report count
1064-
n_reported = 0;
1065-
}
1066-
1067-
// reset the reporting interval only once we're steadily getting
1068-
// heartbeats for the requested reset interval
1069-
if (curr_s - reset_reporting_s > last_report_s) {
1070-
report_interval_s = heartbeat_interval_s;
1059+
// got a heartbeat
1060+
n_hbs_recvd++;
1061+
// if we'd printed task backtraces, check for reset
1062+
if (tasks_showed && n_hbs_recvd >= reset_tasks_after_n) {
1063+
tasks_showed = 0;
1064+
jl_safe_printf("==== heartbeats recovered (lost for %ds) ====\n",
1065+
n_hbs_missed * heartbeat_interval_s);
1066+
}
1067+
n_hbs_missed = 0;
10711068
}
10721069

10731070
return gc_state;
@@ -1076,7 +1073,7 @@ uint8_t check_heartbeats(uint8_t gc_state)
10761073
// heartbeat thread function
10771074
void jl_heartbeat_threadfun(void *arg)
10781075
{
1079-
int s, ns = 1e9 - 1, rs;
1076+
int s = 59, ns = 1e9 - 1, rs;
10801077
uint64_t t0, tchb;
10811078

10821079
// We need a TLS because backtraces are accumulated into ptls->bt_size
@@ -1134,8 +1131,8 @@ void jl_init_heartbeat(void)
11341131
{
11351132
}
11361133

1137-
JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int n_reports,
1138-
int reset_reporting_after_s)
1134+
JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int show_tasks_after_n,
1135+
int reset_after_n)
11391136
{
11401137
return -1;
11411138
}

0 commit comments

Comments
 (0)