mirror of
https://github.com/uutils/coreutils
synced 2024-11-16 09:48:03 +00:00
Optimize recursive ls (#2083)
* ls: Remove allocations by eliminating collect/clones * ls: Introduce PathData structure - PathData will hold Path related metadata / strings that are required frequently in subsequent functions - All data is precomputed and cached and subsequent functions just use cached data * ls: Cache more data related to paths - Cache filename and sort by filename instead of full path - Cache uid->usr and gid->grp mappings https://github.com/uutils/coreutils/pull/2099/files * ls: Add BENCHMARKING.md * ls: Document PathData structure * tests/ls: Add testcase for error paths with width option * ls: Fix unused import warning cached will be only used for unix currently as current use of caching gid/uid mappings is only relevant on unix * ls: Suggest checking syscall count in BENCHMARKING.md * ls: Remove mentions of sort in BENCHMARKING.md * ls: Remove dependency on cached Implement caching using HashMap and lazy_static * ls: Fix MSRV error related to map_or Rust 1.40 did not support map_or for result types
This commit is contained in:
parent
b756b987a4
commit
8554cdf35b
3 changed files with 181 additions and 73 deletions
34
src/uu/ls/BENCHMARKING.md
Normal file
34
src/uu/ls/BENCHMARKING.md
Normal file
|
@ -0,0 +1,34 @@
|
|||
# Benchmarking ls
|
||||
|
||||
ls majorly involves fetching a lot of details (depending upon what details are requested, eg. time/date, inode details, etc) for each path using system calls. Ideally, any system call should be done only once for each of the paths - not adhering to this principle leads to a lot of system call overhead multiplying and bubbling up, especially for recursive ls, therefore it is important to always benchmark multiple scenarios.
|
||||
This is an overwiew over what was benchmarked, and if you make changes to `ls`, you are encouraged to check
|
||||
how performance was affected for the workloads listed below. Feel free to add other workloads to the
|
||||
list that we should improve / make sure not to regress.
|
||||
|
||||
Run `cargo build --release` before benchmarking after you make a change!
|
||||
|
||||
## Simple recursive ls
|
||||
|
||||
- Get a large tree, for example linux kernel source tree.
|
||||
- Benchmark simple recursive ls with hyperfine: `hyperfine --warmup 2 "target/release/coreutils ls -R tree > /dev/null"`.
|
||||
|
||||
## Recursive ls with all and long options
|
||||
|
||||
- Same tree as above
|
||||
- Benchmark recursive ls with -al -R options with hyperfine: `hyperfine --warmup 2 "target/release/coreutils ls -al -R tree > /dev/null"`.
|
||||
|
||||
## Comparing with GNU ls
|
||||
|
||||
Hyperfine accepts multiple commands to run and will compare them. To compare performance with GNU ls
|
||||
duplicate the string you passed to hyperfine but remove the `target/release/coreutils` bit from it.
|
||||
|
||||
Example: `hyperfine --warmup 2 "target/release/coreutils ls -al -R tree > /dev/null"` becomes
|
||||
`hyperfine --warmup 2 "target/release/coreutils ls -al -R tree > /dev/null" "ls -al -R tree > /dev/null"`
|
||||
(This assumes GNU ls is installed as `ls`)
|
||||
|
||||
This can also be used to compare with version of ls built before your changes to ensure your change does not regress this
|
||||
|
||||
## Checking system call count
|
||||
|
||||
- Another thing to look at would be system calls count using strace (on linux) or equivalent on other operating systems.
|
||||
- Example: `strace -c target/release/coreutils ls -al -R tree`
|
|
@ -1038,12 +1038,43 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
|
|||
list(locs, Config::from(matches))
|
||||
}
|
||||
|
||||
/// Represents a Path along with it's associated data
|
||||
/// Any data that will be reused several times makes sense to be added to this structure
|
||||
/// Caching data here helps eliminate redundant syscalls to fetch same information
|
||||
struct PathData {
|
||||
// Result<MetaData> got from symlink_metadata() or metadata() based on config
|
||||
md: std::io::Result<Metadata>,
|
||||
// String formed from get_lossy_string() for the path
|
||||
lossy_string: String,
|
||||
// Name of the file - will be empty for . or ..
|
||||
file_name: String,
|
||||
// PathBuf that all above data corresponds to
|
||||
p_buf: PathBuf,
|
||||
}
|
||||
|
||||
impl PathData {
|
||||
fn new(p_buf: PathBuf, config: &Config, command_line: bool) -> Self {
|
||||
let md = get_metadata(&p_buf, config, command_line);
|
||||
let lossy_string = p_buf.to_string_lossy().into_owned();
|
||||
let name = p_buf
|
||||
.file_name()
|
||||
.map_or(String::new(), |s| s.to_string_lossy().into_owned());
|
||||
Self {
|
||||
md,
|
||||
lossy_string,
|
||||
file_name: name,
|
||||
p_buf,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn list(locs: Vec<String>, config: Config) -> i32 {
|
||||
let number_of_locs = locs.len();
|
||||
|
||||
let mut files = Vec::<PathBuf>::new();
|
||||
let mut dirs = Vec::<PathBuf>::new();
|
||||
let mut files = Vec::<PathData>::new();
|
||||
let mut dirs = Vec::<PathData>::new();
|
||||
let mut has_failed = false;
|
||||
|
||||
for loc in locs {
|
||||
let p = PathBuf::from(&loc);
|
||||
if !p.exists() {
|
||||
|
@ -1054,36 +1085,28 @@ fn list(locs: Vec<String>, config: Config) -> i32 {
|
|||
continue;
|
||||
}
|
||||
|
||||
let show_dir_contents = if !config.directory {
|
||||
match config.dereference {
|
||||
Dereference::None => {
|
||||
if let Ok(md) = p.symlink_metadata() {
|
||||
md.is_dir()
|
||||
} else {
|
||||
show_error!("'{}': {}", &loc, "No such file or directory");
|
||||
has_failed = true;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
_ => p.is_dir(),
|
||||
}
|
||||
let path_data = PathData::new(p, &config, true);
|
||||
|
||||
let show_dir_contents = if let Ok(md) = path_data.md.as_ref() {
|
||||
!config.directory && md.is_dir()
|
||||
} else {
|
||||
has_failed = true;
|
||||
false
|
||||
};
|
||||
|
||||
if show_dir_contents {
|
||||
dirs.push(p);
|
||||
dirs.push(path_data);
|
||||
} else {
|
||||
files.push(p);
|
||||
files.push(path_data);
|
||||
}
|
||||
}
|
||||
sort_entries(&mut files, &config);
|
||||
display_items(&files, None, &config, true);
|
||||
display_items(&files, None, &config);
|
||||
|
||||
sort_entries(&mut dirs, &config);
|
||||
for dir in dirs {
|
||||
if number_of_locs > 1 {
|
||||
println!("\n{}:", dir.to_string_lossy());
|
||||
println!("\n{}:", dir.lossy_string);
|
||||
}
|
||||
enter_directory(&dir, &config);
|
||||
}
|
||||
|
@ -1094,22 +1117,22 @@ fn list(locs: Vec<String>, config: Config) -> i32 {
|
|||
}
|
||||
}
|
||||
|
||||
fn sort_entries(entries: &mut Vec<PathBuf>, config: &Config) {
|
||||
fn sort_entries(entries: &mut Vec<PathData>, config: &Config) {
|
||||
match config.sort {
|
||||
Sort::Time => entries.sort_by_key(|k| {
|
||||
Reverse(
|
||||
get_metadata(k, false)
|
||||
k.md.as_ref()
|
||||
.ok()
|
||||
.and_then(|md| get_system_time(&md, config))
|
||||
.unwrap_or(UNIX_EPOCH),
|
||||
)
|
||||
}),
|
||||
Sort::Size => {
|
||||
entries.sort_by_key(|k| Reverse(get_metadata(k, false).map(|md| md.len()).unwrap_or(0)))
|
||||
entries.sort_by_key(|k| Reverse(k.md.as_ref().map(|md| md.len()).unwrap_or(0)))
|
||||
}
|
||||
// The default sort in GNU ls is case insensitive
|
||||
Sort::Name => entries.sort_by_key(|k| k.to_string_lossy().to_lowercase()),
|
||||
Sort::Version => entries.sort_by(|a, b| version_cmp::version_cmp(a, b)),
|
||||
Sort::Name => entries.sort_by_key(|k| k.file_name.to_lowercase()),
|
||||
Sort::Version => entries.sort_by(|k, j| version_cmp::version_cmp(&k.p_buf, &j.p_buf)),
|
||||
Sort::None => {}
|
||||
}
|
||||
|
||||
|
@ -1143,32 +1166,57 @@ fn should_display(entry: &DirEntry, config: &Config) -> bool {
|
|||
true
|
||||
}
|
||||
|
||||
fn enter_directory(dir: &Path, config: &Config) {
|
||||
let mut entries: Vec<_> = safe_unwrap!(fs::read_dir(dir).and_then(Iterator::collect));
|
||||
|
||||
entries.retain(|e| should_display(e, config));
|
||||
|
||||
let mut entries: Vec<_> = entries.iter().map(DirEntry::path).collect();
|
||||
sort_entries(&mut entries, config);
|
||||
|
||||
if config.files == Files::All {
|
||||
let mut display_entries = entries.clone();
|
||||
display_entries.insert(0, dir.join(".."));
|
||||
display_entries.insert(0, dir.join("."));
|
||||
display_items(&display_entries, Some(dir), config, false);
|
||||
fn enter_directory(dir: &PathData, config: &Config) {
|
||||
let mut entries: Vec<_> = if config.files == Files::All {
|
||||
vec![
|
||||
PathData::new(dir.p_buf.join("."), config, false),
|
||||
PathData::new(dir.p_buf.join(".."), config, false),
|
||||
]
|
||||
} else {
|
||||
display_items(&entries, Some(dir), config, false);
|
||||
}
|
||||
vec![]
|
||||
};
|
||||
|
||||
let mut temp: Vec<_> = safe_unwrap!(fs::read_dir(&dir.p_buf))
|
||||
.map(|res| safe_unwrap!(res))
|
||||
.filter(|e| should_display(e, config))
|
||||
.map(|e| PathData::new(DirEntry::path(&e), config, false))
|
||||
.collect();
|
||||
|
||||
sort_entries(&mut temp, config);
|
||||
|
||||
entries.append(&mut temp);
|
||||
|
||||
display_items(&entries, Some(&dir.p_buf), config);
|
||||
|
||||
if config.recursive {
|
||||
for e in entries.iter().filter(|p| p.is_dir()) {
|
||||
println!("\n{}:", e.to_string_lossy());
|
||||
for e in entries
|
||||
.iter()
|
||||
.skip(if config.files == Files::All { 2 } else { 0 })
|
||||
.filter(|p| p.md.as_ref().map(|md| md.is_dir()).unwrap_or(false))
|
||||
{
|
||||
println!("\n{}:", e.lossy_string);
|
||||
enter_directory(&e, config);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn get_metadata(entry: &Path, dereference: bool) -> std::io::Result<Metadata> {
|
||||
fn get_metadata(entry: &Path, config: &Config, command_line: bool) -> std::io::Result<Metadata> {
|
||||
let dereference = match &config.dereference {
|
||||
Dereference::All => true,
|
||||
Dereference::Args => command_line,
|
||||
Dereference::DirArgs => {
|
||||
if command_line {
|
||||
if let Ok(md) = entry.metadata() {
|
||||
md.is_dir()
|
||||
} else {
|
||||
false
|
||||
}
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
Dereference::None => false,
|
||||
};
|
||||
if dereference {
|
||||
entry.metadata().or_else(|_| entry.symlink_metadata())
|
||||
} else {
|
||||
|
@ -1176,8 +1224,8 @@ fn get_metadata(entry: &Path, dereference: bool) -> std::io::Result<Metadata> {
|
|||
}
|
||||
}
|
||||
|
||||
fn display_dir_entry_size(entry: &Path, config: &Config) -> (usize, usize) {
|
||||
if let Ok(md) = get_metadata(entry, false) {
|
||||
fn display_dir_entry_size(entry: &PathData, config: &Config) -> (usize, usize) {
|
||||
if let Ok(md) = entry.md.as_ref() {
|
||||
(
|
||||
display_symlink_count(&md).len(),
|
||||
display_file_size(&md, config).len(),
|
||||
|
@ -1191,7 +1239,7 @@ fn pad_left(string: String, count: usize) -> String {
|
|||
format!("{:>width$}", string, width = count)
|
||||
}
|
||||
|
||||
fn display_items(items: &[PathBuf], strip: Option<&Path>, config: &Config, command_line: bool) {
|
||||
fn display_items(items: &[PathData], strip: Option<&Path>, config: &Config) {
|
||||
if config.format == Format::Long {
|
||||
let (mut max_links, mut max_size) = (1, 1);
|
||||
for item in items {
|
||||
|
@ -1200,18 +1248,18 @@ fn display_items(items: &[PathBuf], strip: Option<&Path>, config: &Config, comma
|
|||
max_size = size.max(max_size);
|
||||
}
|
||||
for item in items {
|
||||
display_item_long(item, strip, max_links, max_size, config, command_line);
|
||||
display_item_long(item, strip, max_links, max_size, config);
|
||||
}
|
||||
} else {
|
||||
let names = items.iter().filter_map(|i| {
|
||||
let md = get_metadata(i, false);
|
||||
let md = i.md.as_ref();
|
||||
match md {
|
||||
Err(e) => {
|
||||
let filename = get_file_name(i, strip);
|
||||
let filename = get_file_name(&i.p_buf, strip);
|
||||
show_error!("'{}': {}", filename, e);
|
||||
None
|
||||
}
|
||||
Ok(md) => Some(display_file_name(&i, strip, &md, config)),
|
||||
Ok(md) => Some(display_file_name(&i.p_buf, strip, &md, config)),
|
||||
}
|
||||
});
|
||||
|
||||
|
@ -1271,33 +1319,15 @@ fn display_grid(names: impl Iterator<Item = Cell>, width: u16, direction: Direct
|
|||
use uucore::fs::display_permissions;
|
||||
|
||||
fn display_item_long(
|
||||
item: &Path,
|
||||
item: &PathData,
|
||||
strip: Option<&Path>,
|
||||
max_links: usize,
|
||||
max_size: usize,
|
||||
config: &Config,
|
||||
command_line: bool,
|
||||
) {
|
||||
let dereference = match &config.dereference {
|
||||
Dereference::All => true,
|
||||
Dereference::Args => command_line,
|
||||
Dereference::DirArgs => {
|
||||
if command_line {
|
||||
if let Ok(md) = item.metadata() {
|
||||
md.is_dir()
|
||||
} else {
|
||||
false
|
||||
}
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
Dereference::None => false,
|
||||
};
|
||||
|
||||
let md = match get_metadata(item, dereference) {
|
||||
let md = match &item.md {
|
||||
Err(e) => {
|
||||
let filename = get_file_name(&item, strip);
|
||||
let filename = get_file_name(&item.p_buf, strip);
|
||||
show_error!("{}: {}", filename, e);
|
||||
return;
|
||||
}
|
||||
|
@ -1336,7 +1366,7 @@ fn display_item_long(
|
|||
" {} {} {}",
|
||||
pad_left(display_file_size(&md, config), max_size),
|
||||
display_date(&md, config),
|
||||
display_file_name(&item, strip, &md, config).contents,
|
||||
display_file_name(&item.p_buf, strip, &md, config).contents,
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -1348,14 +1378,50 @@ fn get_inode(metadata: &Metadata) -> String {
|
|||
// Currently getpwuid is `linux` target only. If it's broken out into
|
||||
// a posix-compliant attribute this can be updated...
|
||||
#[cfg(unix)]
|
||||
use std::sync::Mutex;
|
||||
#[cfg(unix)]
|
||||
use uucore::entries;
|
||||
|
||||
#[cfg(unix)]
|
||||
fn cached_uid2usr(uid: u32) -> String {
|
||||
lazy_static! {
|
||||
static ref UID_CACHE: Mutex<HashMap<u32, String>> = Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
let mut uid_cache = UID_CACHE.lock().unwrap();
|
||||
match uid_cache.get(&uid) {
|
||||
Some(usr) => usr.clone(),
|
||||
None => {
|
||||
let usr = entries::uid2usr(uid).unwrap_or_else(|_| uid.to_string());
|
||||
uid_cache.insert(uid, usr.clone());
|
||||
usr
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(unix)]
|
||||
fn display_uname(metadata: &Metadata, config: &Config) -> String {
|
||||
if config.long.numeric_uid_gid {
|
||||
metadata.uid().to_string()
|
||||
} else {
|
||||
entries::uid2usr(metadata.uid()).unwrap_or_else(|_| metadata.uid().to_string())
|
||||
cached_uid2usr(metadata.uid())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(unix)]
|
||||
fn cached_gid2grp(gid: u32) -> String {
|
||||
lazy_static! {
|
||||
static ref GID_CACHE: Mutex<HashMap<u32, String>> = Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
let mut gid_cache = GID_CACHE.lock().unwrap();
|
||||
match gid_cache.get(&gid) {
|
||||
Some(grp) => grp.clone(),
|
||||
None => {
|
||||
let grp = entries::gid2grp(gid).unwrap_or_else(|_| gid.to_string());
|
||||
gid_cache.insert(gid, grp.clone());
|
||||
grp
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1364,7 +1430,7 @@ fn display_group(metadata: &Metadata, config: &Config) -> String {
|
|||
if config.long.numeric_uid_gid {
|
||||
metadata.gid().to_string()
|
||||
} else {
|
||||
entries::gid2grp(metadata.gid()).unwrap_or_else(|_| metadata.gid().to_string())
|
||||
cached_gid2grp(metadata.gid())
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -103,6 +103,14 @@ fn test_ls_width() {
|
|||
.succeeds()
|
||||
.stdout_only("test-width-1\ntest-width-2\ntest-width-3\ntest-width-4\n");
|
||||
}
|
||||
|
||||
for option in &["-w 1a", "-w=1a", "--width=1a", "--width 1a"] {
|
||||
scene
|
||||
.ucmd()
|
||||
.args(&option.split(" ").collect::<Vec<_>>())
|
||||
.fails()
|
||||
.stderr_only("ls: error: invalid line width: ‘1a’");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
Loading…
Reference in a new issue