@@ -942,9 +942,9 @@ volatile int heartbeat_enabled;
942
942
uv_sem_t heartbeat_on_sem , // jl_heartbeat_enable -> thread
943
943
heartbeat_off_sem ; // thread -> jl_heartbeat_enable
944
944
int heartbeat_interval_s ,
945
- n_loss_reports ,
946
- reset_reporting_s ;
947
- int last_report_s , report_interval_s , n_reported ;
945
+ tasks_after_n ,
946
+ reset_tasks_after_n ;
947
+ int tasks_showed , n_hbs_missed , n_hbs_recvd ;
948
948
_Atomic(int ) heartbeats ;
949
949
950
950
JL_DLLEXPORT void jl_print_task_backtraces (int show_done ) JL_NOTSAFEPOINT ;
@@ -963,21 +963,19 @@ void jl_init_heartbeat(void)
963
963
964
964
// enable/disable heartbeats
965
965
// heartbeat_s: interval within which jl_heartbeat() must be called
966
- // n_reports: for one heartbeat loss interval, how many times to report
967
- // reset_reporting_after_s: how long to wait after a heartbeat loss
968
- // interval and a return to steady heartbeats, before resetting
969
- // reporting behavior
966
+ // show_tasks_after_n: number of heartbeats missed before printing task backtraces
967
+ // reset_after_n: number of consecutive heartbeats received before task backtrace printing is re-armed
970
968
//
971
969
// When disabling heartbeats, the heartbeat thread must wake up,
972
970
// find out that heartbeats are now disabled, and reset. For now, we
973
971
// handle this by preventing re-enabling of heartbeats until this
974
972
// completes.
975
- JL_DLLEXPORT int jl_heartbeat_enable (int heartbeat_s , int n_reports ,
976
- int reset_reporting_after_s )
973
+ JL_DLLEXPORT int jl_heartbeat_enable (int heartbeat_s , int show_tasks_after_n ,
974
+ int reset_after_n )
977
975
{
978
976
if (heartbeat_s <= 0 ) {
979
977
heartbeat_enabled = 0 ;
980
- heartbeat_interval_s = n_loss_reports = reset_reporting_s = 0 ;
978
+ heartbeat_interval_s = tasks_after_n = reset_tasks_after_n = 0 ;
981
979
}
982
980
else {
983
981
// must disable before enabling
@@ -991,10 +989,11 @@ JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int n_reports,
991
989
992
990
jl_atomic_store_relaxed (& heartbeats , 0 );
993
991
heartbeat_interval_s = heartbeat_s ;
994
- n_loss_reports = n_reports ;
995
- reset_reporting_s = reset_reporting_after_s ;
996
- last_report_s = 0 ;
997
- report_interval_s = heartbeat_interval_s ;
992
+ tasks_after_n = show_tasks_after_n ;
993
+ reset_tasks_after_n = reset_after_n ;
994
+ tasks_showed = 0 ;
995
+ n_hbs_missed = 0 ;
996
+ n_hbs_recvd = 0 ;
998
997
heartbeat_enabled = 1 ;
999
998
uv_sem_post (& heartbeat_on_sem ); // wake the heartbeat thread
1000
999
}
@@ -1030,44 +1029,42 @@ void sleep_for(int secs, int nsecs)
1030
1029
uint8_t check_heartbeats (uint8_t gc_state )
1031
1030
{
1032
1031
int hb = jl_atomic_exchange (& heartbeats , 0 );
1033
- uint64_t curr_s = jl_hrtime () / 1e9 ;
1034
1032
1035
1033
if (hb <= 0 ) {
1036
- // we didn't get a heartbeat in the last interval; should we report?
1037
- if ( n_reported < n_loss_reports &&
1038
- curr_s - last_report_s >= report_interval_s ) {
1039
- jl_task_t * ct = jl_current_task ;
1040
- jl_ptls_t ptls = ct -> ptls ;
1041
-
1042
- // exit GC-safe region to report then re-enter
1043
- jl_gc_safe_leave ( ptls , gc_state );
1044
- jl_safe_printf ( "==== heartbeat loss ====\n" );
1045
- jl_print_task_backtraces ( 0 );
1046
- gc_state = jl_gc_safe_enter ( ptls );
1047
-
1048
- // we've reported
1049
- n_reported ++ ;
1050
-
1051
- // record the reporting time _after_ the report
1052
- last_report_s = jl_hrtime () / 1e9 ;
1053
-
1054
- // double the reporting interval up to a maximum
1055
- if ( report_interval_s < 60 * heartbeat_interval_s ) {
1056
- report_interval_s *= 2 ;
1034
+ // we didn't get a heartbeat
1035
+ n_hbs_recvd = 0 ;
1036
+ n_hbs_missed ++ ;
1037
+
1038
+ // if we've printed task backtraces already, do nothing
1039
+ if (! tasks_showed ) {
1040
+ // otherwise, at least show this message
1041
+ jl_safe_printf ( "==== heartbeat loss (%ds) ====\n" ,
1042
+ n_hbs_missed * heartbeat_interval_s );
1043
+ // if we've missed enough heartbeats, print task backtraces
1044
+ if ( n_hbs_missed >= tasks_after_n ) {
1045
+ jl_task_t * ct = jl_current_task ;
1046
+ jl_ptls_t ptls = ct -> ptls ;
1047
+
1048
+ // exit GC-safe region to report then re-enter
1049
+ jl_gc_safe_leave ( ptls , gc_state );
1050
+ jl_print_task_backtraces ( 0 ) ;
1051
+ gc_state = jl_gc_safe_enter ( ptls );
1052
+
1053
+ // we printed task backtraces
1054
+ tasks_showed = 1 ;
1057
1055
}
1058
1056
}
1059
- // no heartbeats, don't change reporting state
1060
- return gc_state ;
1061
1057
}
1062
1058
else {
1063
- // we got a heartbeat; reset the report count
1064
- n_reported = 0 ;
1065
- }
1066
-
1067
- // reset the reporting interval only once we're steadily getting
1068
- // heartbeats for the requested reset interval
1069
- if (curr_s - reset_reporting_s > last_report_s ) {
1070
- report_interval_s = heartbeat_interval_s ;
1059
+ // got a heartbeat
1060
+ n_hbs_recvd ++ ;
1061
+ // if we'd printed task backtraces, check for reset
1062
+ if (tasks_showed && n_hbs_recvd >= reset_tasks_after_n ) {
1063
+ tasks_showed = 0 ;
1064
+ jl_safe_printf ("==== heartbeats recovered (lost for %ds) ====\n" ,
1065
+ n_hbs_missed * heartbeat_interval_s );
1066
+ }
1067
+ n_hbs_missed = 0 ;
1071
1068
}
1072
1069
1073
1070
return gc_state ;
@@ -1076,7 +1073,7 @@ uint8_t check_heartbeats(uint8_t gc_state)
1076
1073
// heartbeat thread function
1077
1074
void jl_heartbeat_threadfun (void * arg )
1078
1075
{
1079
- int s , ns = 1e9 - 1 , rs ;
1076
+ int s = 59 , ns = 1e9 - 1 , rs ;
1080
1077
uint64_t t0 , tchb ;
1081
1078
1082
1079
// We need a TLS because backtraces are accumulated into ptls->bt_size
@@ -1134,8 +1131,8 @@ void jl_init_heartbeat(void)
1134
1131
{
1135
1132
}
1136
1133
1137
- JL_DLLEXPORT int jl_heartbeat_enable (int heartbeat_s , int n_reports ,
1138
- int reset_reporting_after_s )
1134
+ JL_DLLEXPORT int jl_heartbeat_enable (int heartbeat_s , int show_tasks_after_n ,
1135
+ int reset_after_n )
1139
1136
{
1140
1137
return -1 ;
1141
1138
}
0 commit comments