Skip to content

Commit b00d0fa

Browse files
committed
Reliably signal coordinator thread on panic during ongoing codegen
Replace the separate AbortCodegenOnDrop guard by integrating this functionality into OngoingCodegen (or rather, the Coordinator part of it). This ensures that we send a CodegenAborted message and wait for workers to finish even if the panic occurs outside codegen_crate() (e.g. inside join_codegen()). This requires some minor changes to the handling of CodegenAborted, as it can now occur when the main thread is LLVMing rather than Codegenning.
1 parent 6a1f77d commit b00d0fa

File tree

2 files changed

+43
-76
lines changed

2 files changed

+43
-76
lines changed

compiler/rustc_codegen_ssa/src/back/write.rs

Lines changed: 39 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ use rustc_target::spec::{MergeFunctions, SanitizerSet};
3737
use std::any::Any;
3838
use std::fs;
3939
use std::io;
40+
use std::marker::PhantomData;
4041
use std::mem;
4142
use std::path::{Path, PathBuf};
4243
use std::str;
@@ -475,10 +476,13 @@ pub fn start_async_codegen<B: ExtraBackendMethods>(
475476
metadata_module,
476477
crate_info,
477478

478-
coordinator_send,
479479
codegen_worker_receive,
480480
shared_emitter_main,
481-
future: coordinator_thread,
481+
coordinator: Coordinator {
482+
sender: coordinator_send,
483+
future: Some(coordinator_thread),
484+
phantom: PhantomData,
485+
},
482486
output_filenames: tcx.output_filenames(()).clone(),
483487
}
484488
}
@@ -1273,6 +1277,7 @@ fn start_executing_work<B: ExtraBackendMethods>(
12731277
// work to be done.
12741278
while !codegen_done
12751279
|| running > 0
1280+
|| main_thread_worker_state == MainThreadWorkerState::LLVMing
12761281
|| (!codegen_aborted
12771282
&& !(work_items.is_empty()
12781283
&& needs_fat_lto.is_empty()
@@ -1492,7 +1497,6 @@ fn start_executing_work<B: ExtraBackendMethods>(
14921497
assert!(!codegen_aborted);
14931498
codegen_done = true;
14941499
codegen_aborted = true;
1495-
assert_eq!(main_thread_worker_state, MainThreadWorkerState::Codegenning);
14961500
}
14971501
Message::Done { result: Ok(compiled_module), worker_id } => {
14981502
free_worker(worker_id);
@@ -1539,6 +1543,10 @@ fn start_executing_work<B: ExtraBackendMethods>(
15391543
}
15401544
}
15411545

1546+
if codegen_aborted {
1547+
return Err(());
1548+
}
1549+
15421550
let needs_link = mem::take(&mut needs_link);
15431551
if !needs_link.is_empty() {
15441552
assert!(compiled_modules.is_empty());
@@ -1828,25 +1836,47 @@ impl SharedEmitterMain {
18281836
}
18291837
}
18301838

1839+
pub struct Coordinator<B: ExtraBackendMethods> {
1840+
pub sender: Sender<Box<dyn Any + Send>>,
1841+
future: Option<thread::JoinHandle<Result<CompiledModules, ()>>>,
1842+
// Only used for the Message type.
1843+
phantom: PhantomData<B>,
1844+
}
1845+
1846+
impl<B: ExtraBackendMethods> Coordinator<B> {
1847+
fn join(mut self) -> std::thread::Result<Result<CompiledModules, ()>> {
1848+
self.future.take().unwrap().join()
1849+
}
1850+
}
1851+
1852+
impl<B: ExtraBackendMethods> Drop for Coordinator<B> {
1853+
fn drop(&mut self) {
1854+
if let Some(future) = self.future.take() {
1855+
// If we haven't joined yet, signal to the coordinator that it should spawn no more
1856+
// work, and wait for worker threads to finish.
1857+
drop(self.sender.send(Box::new(Message::CodegenAborted::<B>)));
1858+
drop(future.join());
1859+
}
1860+
}
1861+
}
1862+
18311863
pub struct OngoingCodegen<B: ExtraBackendMethods> {
18321864
pub backend: B,
18331865
pub metadata: EncodedMetadata,
18341866
pub metadata_module: Option<CompiledModule>,
18351867
pub crate_info: CrateInfo,
1836-
pub coordinator_send: Sender<Box<dyn Any + Send>>,
18371868
pub codegen_worker_receive: Receiver<Message<B>>,
18381869
pub shared_emitter_main: SharedEmitterMain,
1839-
pub future: thread::JoinHandle<Result<CompiledModules, ()>>,
18401870
pub output_filenames: Arc<OutputFilenames>,
1871+
pub coordinator: Coordinator<B>,
18411872
}
18421873

18431874
impl<B: ExtraBackendMethods> OngoingCodegen<B> {
18441875
pub fn join(self, sess: &Session) -> (CodegenResults, FxHashMap<WorkProductId, WorkProduct>) {
18451876
let _timer = sess.timer("finish_ongoing_codegen");
18461877

18471878
self.shared_emitter_main.check(sess, true);
1848-
let future = self.future;
1849-
let compiled_modules = sess.time("join_worker_thread", || match future.join() {
1879+
let compiled_modules = sess.time("join_worker_thread", || match self.coordinator.join() {
18501880
Ok(Ok(compiled_modules)) => compiled_modules,
18511881
Ok(Err(())) => {
18521882
sess.abort_if_errors();
@@ -1894,26 +1924,13 @@ impl<B: ExtraBackendMethods> OngoingCodegen<B> {
18941924

18951925
// These are generally cheap and won't throw off scheduling.
18961926
let cost = 0;
1897-
submit_codegened_module_to_llvm(&self.backend, &self.coordinator_send, module, cost);
1927+
submit_codegened_module_to_llvm(&self.backend, &self.coordinator.sender, module, cost);
18981928
}
18991929

19001930
pub fn codegen_finished(&self, tcx: TyCtxt<'_>) {
19011931
self.wait_for_signal_to_codegen_item();
19021932
self.check_for_errors(tcx.sess);
1903-
drop(self.coordinator_send.send(Box::new(Message::CodegenComplete::<B>)));
1904-
}
1905-
1906-
/// Consumes this context indicating that codegen was entirely aborted, and
1907-
/// we need to exit as quickly as possible.
1908-
///
1909-
/// This method blocks the current thread until all worker threads have
1910-
/// finished, and all worker threads should have exited or be real close to
1911-
/// exiting at this point.
1912-
pub fn codegen_aborted(self) {
1913-
// Signal to the coordinator it should spawn no more work and start
1914-
// shutdown.
1915-
drop(self.coordinator_send.send(Box::new(Message::CodegenAborted::<B>)));
1916-
drop(self.future.join());
1933+
drop(self.coordinator.sender.send(Box::new(Message::CodegenComplete::<B>)));
19171934
}
19181935

19191936
pub fn check_for_errors(&self, sess: &Session) {

compiler/rustc_codegen_ssa/src/base.rs

Lines changed: 4 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@ use rustc_target::abi::{Align, VariantIdx};
3939

4040
use std::collections::BTreeSet;
4141
use std::convert::TryFrom;
42-
use std::ops::{Deref, DerefMut};
4342
use std::time::{Duration, Instant};
4443

4544
use itertools::Itertools;
@@ -583,7 +582,6 @@ pub fn codegen_crate<B: ExtraBackendMethods>(
583582
metadata_module,
584583
codegen_units.len(),
585584
);
586-
let ongoing_codegen = AbortCodegenOnDrop::<B>(Some(ongoing_codegen));
587585

588586
// Codegen an allocator shim, if necessary.
589587
//
@@ -704,7 +702,7 @@ pub fn codegen_crate<B: ExtraBackendMethods>(
704702

705703
submit_codegened_module_to_llvm(
706704
&backend,
707-
&ongoing_codegen.coordinator_send,
705+
&ongoing_codegen.coordinator.sender,
708706
module,
709707
cost,
710708
);
@@ -714,7 +712,7 @@ pub fn codegen_crate<B: ExtraBackendMethods>(
714712
submit_pre_lto_module_to_llvm(
715713
&backend,
716714
tcx,
717-
&ongoing_codegen.coordinator_send,
715+
&ongoing_codegen.coordinator.sender,
718716
CachedModuleCodegen {
719717
name: cgu.name().to_string(),
720718
source: cgu.previous_work_product(tcx),
@@ -725,7 +723,7 @@ pub fn codegen_crate<B: ExtraBackendMethods>(
725723
CguReuse::PostLto => {
726724
submit_post_lto_module_to_llvm(
727725
&backend,
728-
&ongoing_codegen.coordinator_send,
726+
&ongoing_codegen.coordinator.sender,
729727
CachedModuleCodegen {
730728
name: cgu.name().to_string(),
731729
source: cgu.previous_work_product(tcx),
@@ -752,55 +750,7 @@ pub fn codegen_crate<B: ExtraBackendMethods>(
752750
}
753751

754752
ongoing_codegen.check_for_errors(tcx.sess);
755-
756-
ongoing_codegen.into_inner()
757-
}
758-
759-
/// A curious wrapper structure whose only purpose is to call `codegen_aborted`
760-
/// when it's dropped abnormally.
761-
///
762-
/// In the process of working on rust-lang/rust#55238 a mysterious segfault was
763-
/// stumbled upon. The segfault was never reproduced locally, but it was
764-
/// suspected to be related to the fact that codegen worker threads were
765-
/// sticking around by the time the main thread was exiting, causing issues.
766-
///
767-
/// This structure is an attempt to fix that issue where the `codegen_aborted`
768-
/// message will block until all workers have finished. This should ensure that
769-
/// even if the main codegen thread panics we'll wait for pending work to
770-
/// complete before returning from the main thread, hopefully avoiding
771-
/// segfaults.
772-
///
773-
/// If you see this comment in the code, then it means that this workaround
774-
/// worked! We may yet one day track down the mysterious cause of that
775-
/// segfault...
776-
struct AbortCodegenOnDrop<B: ExtraBackendMethods>(Option<OngoingCodegen<B>>);
777-
778-
impl<B: ExtraBackendMethods> AbortCodegenOnDrop<B> {
779-
fn into_inner(mut self) -> OngoingCodegen<B> {
780-
self.0.take().unwrap()
781-
}
782-
}
783-
784-
impl<B: ExtraBackendMethods> Deref for AbortCodegenOnDrop<B> {
785-
type Target = OngoingCodegen<B>;
786-
787-
fn deref(&self) -> &OngoingCodegen<B> {
788-
self.0.as_ref().unwrap()
789-
}
790-
}
791-
792-
impl<B: ExtraBackendMethods> DerefMut for AbortCodegenOnDrop<B> {
793-
fn deref_mut(&mut self) -> &mut OngoingCodegen<B> {
794-
self.0.as_mut().unwrap()
795-
}
796-
}
797-
798-
impl<B: ExtraBackendMethods> Drop for AbortCodegenOnDrop<B> {
799-
fn drop(&mut self) {
800-
if let Some(codegen) = self.0.take() {
801-
codegen.codegen_aborted();
802-
}
803-
}
753+
ongoing_codegen
804754
}
805755

806756
impl CrateInfo {

0 commit comments

Comments
 (0)