Fix locking soundness in PersistentPlugin (#12182)

# Description

There were two problems in `PersistentPlugin` which could cause a
deadlock:

1. There were two mutexes being used, and `get()` could potentially hold
both simultaneously if it had to spawn. This won't necessarily cause a
deadlock on its own, but it does mean that lock order is sensitive

2. `set_gc_config()` called `flush()` while still holding the lock,
meaning that the GC thread had to proceed before the lock was released.
However, waiting for the GC thread to proceed could mean waiting for the
GC thread to call `stop()`, which itself would try to lock the mutex.
So, it's not safe to wait for the GC thread while the lock is held. This
is fixed now.

I've also reverted #12177, as @IanManske reported that this was also
happening for him on Linux, and it seems to be this problem which should
not be platform-specific at all. I believe this solves it.

# User-Facing Changes
None

# Tests + Formatting
- 🟢 `toolkit fmt`
- 🟢 `toolkit clippy`
- 🟢 `toolkit test`
- 🟢 `toolkit test stdlib`

# After Submitting
This commit is contained in:
Devyn Cairns 2024-03-12 16:22:29 -07:00 committed by GitHub
parent cd71372ea9
commit e1cfc96ee8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 40 additions and 52 deletions

View file

@ -17,10 +17,18 @@ use super::{create_command, gc::PluginGc, make_plugin_interface, PluginInterface
pub struct PersistentPlugin {
/// Identity (filename, shell, name) of the plugin
identity: PluginIdentity,
/// Mutable state
mutable: Mutex<MutableState>,
}
/// The mutable state for the persistent plugin. This should all be behind one lock to prevent lock
/// order problems.
#[derive(Debug)]
struct MutableState {
/// Reference to the plugin if running
running: Mutex<Option<RunningPlugin>>,
running: Option<RunningPlugin>,
/// Garbage collector config
gc_config: Mutex<PluginGcConfig>,
gc_config: PluginGcConfig,
}
#[derive(Debug)]
@ -38,8 +46,10 @@ impl PersistentPlugin {
pub fn new(identity: PluginIdentity, gc_config: PluginGcConfig) -> PersistentPlugin {
PersistentPlugin {
identity,
running: Mutex::new(None),
gc_config: Mutex::new(gc_config),
mutable: Mutex::new(MutableState {
running: None,
gc_config,
}),
}
}
@ -56,14 +66,14 @@ impl PersistentPlugin {
K: AsRef<OsStr>,
V: AsRef<OsStr>,
{
let mut running = self.running.lock().map_err(|_| ShellError::NushellFailed {
let mut mutable = self.mutable.lock().map_err(|_| ShellError::NushellFailed {
msg: format!(
"plugin `{}` running mutex poisoned, probably panic during spawn",
"plugin `{}` mutex poisoned, probably panic during spawn",
self.identity.name()
),
})?;
if let Some(ref running) = *running {
if let Some(ref running) = mutable.running {
// It exists, so just clone the interface
Ok(running.interface.clone())
} else {
@ -71,9 +81,9 @@ impl PersistentPlugin {
//
// We hold the lock the whole time to prevent others from trying to spawn and ending
// up with duplicate plugins
let new_running = self.clone().spawn(envs()?)?;
let new_running = self.clone().spawn(envs()?, &mutable.gc_config)?;
let interface = new_running.interface.clone();
*running = Some(new_running);
mutable.running = Some(new_running);
Ok(interface)
}
}
@ -82,6 +92,7 @@ impl PersistentPlugin {
fn spawn(
self: Arc<Self>,
envs: impl IntoIterator<Item = (impl AsRef<OsStr>, impl AsRef<OsStr>)>,
gc_config: &PluginGcConfig,
) -> Result<RunningPlugin, ShellError> {
let source_file = self.identity.filename();
let mut plugin_cmd = create_command(source_file, self.identity.shell());
@ -111,14 +122,7 @@ impl PersistentPlugin {
})?;
// Start the plugin garbage collector
let gc_config =
self.gc_config
.lock()
.map(|c| c.clone())
.map_err(|_| ShellError::NushellFailed {
msg: "plugin gc mutex poisoned".into(),
})?;
let gc = PluginGc::new(gc_config, &self)?;
let gc = PluginGc::new(gc_config.clone(), &self)?;
let pid = child.id();
let interface =
@ -136,47 +140,51 @@ impl RegisteredPlugin for PersistentPlugin {
fn is_running(&self) -> bool {
// If the lock is poisoned, we return false here. That may not be correct, but this is a
// failure state anyway that would be noticed at some point
self.running.lock().map(|r| r.is_some()).unwrap_or(false)
self.mutable
.lock()
.map(|m| m.running.is_some())
.unwrap_or(false)
}
fn pid(&self) -> Option<u32> {
// Again, we return None for a poisoned lock.
self.running
self.mutable
.lock()
.ok()
.and_then(|r| r.as_ref().map(|r| r.pid))
.and_then(|r| r.running.as_ref().map(|r| r.pid))
}
fn stop(&self) -> Result<(), ShellError> {
let mut running = self.running.lock().map_err(|_| ShellError::NushellFailed {
let mut mutable = self.mutable.lock().map_err(|_| ShellError::NushellFailed {
msg: format!(
"plugin `{}` running mutex poisoned, probably panic during spawn",
"plugin `{}` mutable mutex poisoned, probably panic during spawn",
self.identity.name()
),
})?;
// If the plugin is running, stop its GC, so that the GC doesn't accidentally try to stop
// a future plugin
if let Some(running) = running.as_ref() {
if let Some(ref running) = mutable.running {
running.gc.stop_tracking();
}
// We don't try to kill the process or anything, we just drop the RunningPlugin. It should
// exit soon after
*running = None;
mutable.running = None;
Ok(())
}
fn set_gc_config(&self, gc_config: &PluginGcConfig) {
if let Ok(mut conf) = self.gc_config.lock() {
if let Ok(mut mutable) = self.mutable.lock() {
// Save the new config for future calls
*conf = gc_config.clone();
}
if let Ok(running) = self.running.lock() {
if let Some(running) = running.as_ref() {
// If the plugin is already running, propagate the config change to the running GC
running.gc.set_config(gc_config.clone());
running.gc.flush();
mutable.gc_config = gc_config.clone();
// If the plugin is already running, propagate the config change to the running GC
if let Some(gc) = mutable.running.as_ref().map(|running| running.gc.clone()) {
// We don't want to get caught holding the lock
drop(mutable);
gc.set_config(gc_config.clone());
gc.flush();
}
}
}

View file

@ -193,10 +193,6 @@ fn custom_values_can_still_be_collapsed_after_stop() {
}
#[test]
#[cfg_attr(
target_os = "macos",
ignore = "Plugin GC tests are disabled temporarily on macOS because they get stuck"
)]
fn plugin_gc_can_be_configured_to_stop_plugins_immediately() {
// I know the test is to stop "immediately", but if we actually check immediately it could
// lead to a race condition. So there's a 1ms sleep just to be fair.
@ -232,10 +228,6 @@ fn plugin_gc_can_be_configured_to_stop_plugins_immediately() {
}
#[test]
#[cfg_attr(
target_os = "macos",
ignore = "Plugin GC tests are disabled temporarily on macOS because they get stuck"
)]
fn plugin_gc_can_be_configured_to_stop_plugins_after_delay() {
let out = nu_with_plugins!(
cwd: ".",
@ -293,10 +285,6 @@ fn plugin_gc_can_be_configured_to_stop_plugins_after_delay() {
}
#[test]
#[cfg_attr(
target_os = "macos",
ignore = "Plugin GC tests are disabled temporarily on macOS because they get stuck"
)]
fn plugin_gc_can_be_configured_as_disabled() {
let out = nu_with_plugins!(
cwd: ".",
@ -329,10 +317,6 @@ fn plugin_gc_can_be_configured_as_disabled() {
}
#[test]
#[cfg_attr(
target_os = "macos",
ignore = "Plugin GC tests are disabled temporarily on macOS because they get stuck"
)]
fn plugin_gc_can_be_disabled_by_plugin() {
let out = nu_with_plugins!(
cwd: ".",
@ -350,10 +334,6 @@ fn plugin_gc_can_be_disabled_by_plugin() {
}
#[test]
#[cfg_attr(
target_os = "macos",
ignore = "Plugin GC tests are disabled temporarily on macOS because they get stuck"
)]
fn plugin_gc_does_not_stop_plugin_while_stream_output_is_active() {
let out = nu_with_plugins!(
cwd: ".",