Fix locking soundness in PersistentPlugin (#12182)

# Description

There were two problems in `PersistentPlugin` which could cause a
deadlock:

1. There were two mutexes being used, and `get()` could potentially hold
both simultaneously if it had to spawn. This won't necessarily cause a
deadlock on its own, but it does mean that lock order is sensitive

2. `set_gc_config()` called `flush()` while still holding the lock,
meaning that the GC thread had to proceed before the lock was released.
However, waiting for the GC thread to proceed could mean waiting for the
GC thread to call `stop()`, which itself would try to lock the mutex.
So, it's not safe to wait for the GC thread while the lock is held. This
is fixed now.

I've also reverted #12177, as @IanManske reported that this was also
happening for him on Linux, and it seems to be this problem which should
not be platform-specific at all. I believe this solves it.

# User-Facing Changes
None

# Tests + Formatting
- 🟢 `toolkit fmt`
- 🟢 `toolkit clippy`
- 🟢 `toolkit test`
- 🟢 `toolkit test stdlib`

# After Submitting
This commit is contained in:
Devyn Cairns 2024-03-12 16:22:29 -07:00 committed by GitHub
parent cd71372ea9
commit e1cfc96ee8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 40 additions and 52 deletions

View file

@ -17,10 +17,18 @@ use super::{create_command, gc::PluginGc, make_plugin_interface, PluginInterface
pub struct PersistentPlugin { pub struct PersistentPlugin {
/// Identity (filename, shell, name) of the plugin /// Identity (filename, shell, name) of the plugin
identity: PluginIdentity, identity: PluginIdentity,
/// Mutable state
mutable: Mutex<MutableState>,
}
/// The mutable state for the persistent plugin. This should all be behind one lock to prevent lock
/// order problems.
#[derive(Debug)]
struct MutableState {
/// Reference to the plugin if running /// Reference to the plugin if running
running: Mutex<Option<RunningPlugin>>, running: Option<RunningPlugin>,
/// Garbage collector config /// Garbage collector config
gc_config: Mutex<PluginGcConfig>, gc_config: PluginGcConfig,
} }
#[derive(Debug)] #[derive(Debug)]
@ -38,8 +46,10 @@ impl PersistentPlugin {
pub fn new(identity: PluginIdentity, gc_config: PluginGcConfig) -> PersistentPlugin { pub fn new(identity: PluginIdentity, gc_config: PluginGcConfig) -> PersistentPlugin {
PersistentPlugin { PersistentPlugin {
identity, identity,
running: Mutex::new(None), mutable: Mutex::new(MutableState {
gc_config: Mutex::new(gc_config), running: None,
gc_config,
}),
} }
} }
@ -56,14 +66,14 @@ impl PersistentPlugin {
K: AsRef<OsStr>, K: AsRef<OsStr>,
V: AsRef<OsStr>, V: AsRef<OsStr>,
{ {
let mut running = self.running.lock().map_err(|_| ShellError::NushellFailed { let mut mutable = self.mutable.lock().map_err(|_| ShellError::NushellFailed {
msg: format!( msg: format!(
"plugin `{}` running mutex poisoned, probably panic during spawn", "plugin `{}` mutex poisoned, probably panic during spawn",
self.identity.name() self.identity.name()
), ),
})?; })?;
if let Some(ref running) = *running { if let Some(ref running) = mutable.running {
// It exists, so just clone the interface // It exists, so just clone the interface
Ok(running.interface.clone()) Ok(running.interface.clone())
} else { } else {
@ -71,9 +81,9 @@ impl PersistentPlugin {
// //
// We hold the lock the whole time to prevent others from trying to spawn and ending // We hold the lock the whole time to prevent others from trying to spawn and ending
// up with duplicate plugins // up with duplicate plugins
let new_running = self.clone().spawn(envs()?)?; let new_running = self.clone().spawn(envs()?, &mutable.gc_config)?;
let interface = new_running.interface.clone(); let interface = new_running.interface.clone();
*running = Some(new_running); mutable.running = Some(new_running);
Ok(interface) Ok(interface)
} }
} }
@ -82,6 +92,7 @@ impl PersistentPlugin {
fn spawn( fn spawn(
self: Arc<Self>, self: Arc<Self>,
envs: impl IntoIterator<Item = (impl AsRef<OsStr>, impl AsRef<OsStr>)>, envs: impl IntoIterator<Item = (impl AsRef<OsStr>, impl AsRef<OsStr>)>,
gc_config: &PluginGcConfig,
) -> Result<RunningPlugin, ShellError> { ) -> Result<RunningPlugin, ShellError> {
let source_file = self.identity.filename(); let source_file = self.identity.filename();
let mut plugin_cmd = create_command(source_file, self.identity.shell()); let mut plugin_cmd = create_command(source_file, self.identity.shell());
@ -111,14 +122,7 @@ impl PersistentPlugin {
})?; })?;
// Start the plugin garbage collector // Start the plugin garbage collector
let gc_config = let gc = PluginGc::new(gc_config.clone(), &self)?;
self.gc_config
.lock()
.map(|c| c.clone())
.map_err(|_| ShellError::NushellFailed {
msg: "plugin gc mutex poisoned".into(),
})?;
let gc = PluginGc::new(gc_config, &self)?;
let pid = child.id(); let pid = child.id();
let interface = let interface =
@ -136,47 +140,51 @@ impl RegisteredPlugin for PersistentPlugin {
fn is_running(&self) -> bool { fn is_running(&self) -> bool {
// If the lock is poisoned, we return false here. That may not be correct, but this is a // If the lock is poisoned, we return false here. That may not be correct, but this is a
// failure state anyway that would be noticed at some point // failure state anyway that would be noticed at some point
self.running.lock().map(|r| r.is_some()).unwrap_or(false) self.mutable
.lock()
.map(|m| m.running.is_some())
.unwrap_or(false)
} }
fn pid(&self) -> Option<u32> { fn pid(&self) -> Option<u32> {
// Again, we return None for a poisoned lock. // Again, we return None for a poisoned lock.
self.running self.mutable
.lock() .lock()
.ok() .ok()
.and_then(|r| r.as_ref().map(|r| r.pid)) .and_then(|r| r.running.as_ref().map(|r| r.pid))
} }
fn stop(&self) -> Result<(), ShellError> { fn stop(&self) -> Result<(), ShellError> {
let mut running = self.running.lock().map_err(|_| ShellError::NushellFailed { let mut mutable = self.mutable.lock().map_err(|_| ShellError::NushellFailed {
msg: format!( msg: format!(
"plugin `{}` running mutex poisoned, probably panic during spawn", "plugin `{}` mutable mutex poisoned, probably panic during spawn",
self.identity.name() self.identity.name()
), ),
})?; })?;
// If the plugin is running, stop its GC, so that the GC doesn't accidentally try to stop // If the plugin is running, stop its GC, so that the GC doesn't accidentally try to stop
// a future plugin // a future plugin
if let Some(running) = running.as_ref() { if let Some(ref running) = mutable.running {
running.gc.stop_tracking(); running.gc.stop_tracking();
} }
// We don't try to kill the process or anything, we just drop the RunningPlugin. It should // We don't try to kill the process or anything, we just drop the RunningPlugin. It should
// exit soon after // exit soon after
*running = None; mutable.running = None;
Ok(()) Ok(())
} }
fn set_gc_config(&self, gc_config: &PluginGcConfig) { fn set_gc_config(&self, gc_config: &PluginGcConfig) {
if let Ok(mut conf) = self.gc_config.lock() { if let Ok(mut mutable) = self.mutable.lock() {
// Save the new config for future calls // Save the new config for future calls
*conf = gc_config.clone(); mutable.gc_config = gc_config.clone();
}
if let Ok(running) = self.running.lock() { // If the plugin is already running, propagate the config change to the running GC
if let Some(running) = running.as_ref() { if let Some(gc) = mutable.running.as_ref().map(|running| running.gc.clone()) {
// If the plugin is already running, propagate the config change to the running GC // We don't want to get caught holding the lock
running.gc.set_config(gc_config.clone()); drop(mutable);
running.gc.flush(); gc.set_config(gc_config.clone());
gc.flush();
} }
} }
} }

View file

@ -193,10 +193,6 @@ fn custom_values_can_still_be_collapsed_after_stop() {
} }
#[test] #[test]
#[cfg_attr(
target_os = "macos",
ignore = "Plugin GC tests are disabled temporarily on macOS because they get stuck"
)]
fn plugin_gc_can_be_configured_to_stop_plugins_immediately() { fn plugin_gc_can_be_configured_to_stop_plugins_immediately() {
// I know the test is to stop "immediately", but if we actually check immediately it could // I know the test is to stop "immediately", but if we actually check immediately it could
// lead to a race condition. So there's a 1ms sleep just to be fair. // lead to a race condition. So there's a 1ms sleep just to be fair.
@ -232,10 +228,6 @@ fn plugin_gc_can_be_configured_to_stop_plugins_immediately() {
} }
#[test] #[test]
#[cfg_attr(
target_os = "macos",
ignore = "Plugin GC tests are disabled temporarily on macOS because they get stuck"
)]
fn plugin_gc_can_be_configured_to_stop_plugins_after_delay() { fn plugin_gc_can_be_configured_to_stop_plugins_after_delay() {
let out = nu_with_plugins!( let out = nu_with_plugins!(
cwd: ".", cwd: ".",
@ -293,10 +285,6 @@ fn plugin_gc_can_be_configured_to_stop_plugins_after_delay() {
} }
#[test] #[test]
#[cfg_attr(
target_os = "macos",
ignore = "Plugin GC tests are disabled temporarily on macOS because they get stuck"
)]
fn plugin_gc_can_be_configured_as_disabled() { fn plugin_gc_can_be_configured_as_disabled() {
let out = nu_with_plugins!( let out = nu_with_plugins!(
cwd: ".", cwd: ".",
@ -329,10 +317,6 @@ fn plugin_gc_can_be_configured_as_disabled() {
} }
#[test] #[test]
#[cfg_attr(
target_os = "macos",
ignore = "Plugin GC tests are disabled temporarily on macOS because they get stuck"
)]
fn plugin_gc_can_be_disabled_by_plugin() { fn plugin_gc_can_be_disabled_by_plugin() {
let out = nu_with_plugins!( let out = nu_with_plugins!(
cwd: ".", cwd: ".",
@ -350,10 +334,6 @@ fn plugin_gc_can_be_disabled_by_plugin() {
} }
#[test] #[test]
#[cfg_attr(
target_os = "macos",
ignore = "Plugin GC tests are disabled temporarily on macOS because they get stuck"
)]
fn plugin_gc_does_not_stop_plugin_while_stream_output_is_active() { fn plugin_gc_does_not_stop_plugin_while_stream_output_is_active() {
let out = nu_with_plugins!( let out = nu_with_plugins!(
cwd: ".", cwd: ".",