Optimize recursive ls (#2083)

* ls: Remove allocations by eliminating collect/clones

* ls: Introduce PathData structure

- PathData will hold Path related metadata / strings that are required
frequently in subsequent functions
- All data is precomputed and cached and subsequent functions just
use cached data

* ls: Cache more data related to paths

- Cache filename and sort by filename instead of full path
- Cache uid->usr and gid->grp mappings
https://github.com/uutils/coreutils/pull/2099/files
* ls: Add BENCHMARKING.md

* ls: Document PathData structure

* tests/ls: Add testcase for error paths with width option

* ls: Fix unused import warning

cached will be only used for unix currently as current use of
caching gid/uid mappings is only relevant on unix

* ls: Suggest checking syscall count in BENCHMARKING.md

* ls: Remove mentions of sort in BENCHMARKING.md

* ls: Remove dependency on cached

Implement caching using HashMap and lazy_static

* ls: Fix MSRV error related to map_or

Rust 1.40 did not support map_or for result types
This commit is contained in:
Anup Mahindre 2021-04-22 12:49:17 +05:30 committed by GitHub
parent b756b987a4
commit 8554cdf35b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 181 additions and 73 deletions

34
src/uu/ls/BENCHMARKING.md Normal file
View file

@ -0,0 +1,34 @@
# Benchmarking ls
ls majorly involves fetching a lot of details (depending upon what details are requested, eg. time/date, inode details, etc) for each path using system calls. Ideally, any system call should be done only once for each of the paths - not adhering to this principle leads to a lot of system call overhead multiplying and bubbling up, especially for recursive ls, therefore it is important to always benchmark multiple scenarios.
This is an overwiew over what was benchmarked, and if you make changes to `ls`, you are encouraged to check
how performance was affected for the workloads listed below. Feel free to add other workloads to the
list that we should improve / make sure not to regress.
Run `cargo build --release` before benchmarking after you make a change!
## Simple recursive ls
- Get a large tree, for example linux kernel source tree.
- Benchmark simple recursive ls with hyperfine: `hyperfine --warmup 2 "target/release/coreutils ls -R tree > /dev/null"`.
## Recursive ls with all and long options
- Same tree as above
- Benchmark recursive ls with -al -R options with hyperfine: `hyperfine --warmup 2 "target/release/coreutils ls -al -R tree > /dev/null"`.
## Comparing with GNU ls
Hyperfine accepts multiple commands to run and will compare them. To compare performance with GNU ls
duplicate the string you passed to hyperfine but remove the `target/release/coreutils` bit from it.
Example: `hyperfine --warmup 2 "target/release/coreutils ls -al -R tree > /dev/null"` becomes
`hyperfine --warmup 2 "target/release/coreutils ls -al -R tree > /dev/null" "ls -al -R tree > /dev/null"`
(This assumes GNU ls is installed as `ls`)
This can also be used to compare with version of ls built before your changes to ensure your change does not regress this
## Checking system call count
- Another thing to look at would be system calls count using strace (on linux) or equivalent on other operating systems.
- Example: `strace -c target/release/coreutils ls -al -R tree`

View file

@ -1038,12 +1038,43 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
list(locs, Config::from(matches))
}
/// Represents a Path along with it's associated data
/// Any data that will be reused several times makes sense to be added to this structure
/// Caching data here helps eliminate redundant syscalls to fetch same information
struct PathData {
// Result<MetaData> got from symlink_metadata() or metadata() based on config
md: std::io::Result<Metadata>,
// String formed from get_lossy_string() for the path
lossy_string: String,
// Name of the file - will be empty for . or ..
file_name: String,
// PathBuf that all above data corresponds to
p_buf: PathBuf,
}
impl PathData {
fn new(p_buf: PathBuf, config: &Config, command_line: bool) -> Self {
let md = get_metadata(&p_buf, config, command_line);
let lossy_string = p_buf.to_string_lossy().into_owned();
let name = p_buf
.file_name()
.map_or(String::new(), |s| s.to_string_lossy().into_owned());
Self {
md,
lossy_string,
file_name: name,
p_buf,
}
}
}
fn list(locs: Vec<String>, config: Config) -> i32 {
let number_of_locs = locs.len();
let mut files = Vec::<PathBuf>::new();
let mut dirs = Vec::<PathBuf>::new();
let mut files = Vec::<PathData>::new();
let mut dirs = Vec::<PathData>::new();
let mut has_failed = false;
for loc in locs {
let p = PathBuf::from(&loc);
if !p.exists() {
@ -1054,36 +1085,28 @@ fn list(locs: Vec<String>, config: Config) -> i32 {
continue;
}
let show_dir_contents = if !config.directory {
match config.dereference {
Dereference::None => {
if let Ok(md) = p.symlink_metadata() {
md.is_dir()
} else {
show_error!("'{}': {}", &loc, "No such file or directory");
has_failed = true;
continue;
}
}
_ => p.is_dir(),
}
let path_data = PathData::new(p, &config, true);
let show_dir_contents = if let Ok(md) = path_data.md.as_ref() {
!config.directory && md.is_dir()
} else {
has_failed = true;
false
};
if show_dir_contents {
dirs.push(p);
dirs.push(path_data);
} else {
files.push(p);
files.push(path_data);
}
}
sort_entries(&mut files, &config);
display_items(&files, None, &config, true);
display_items(&files, None, &config);
sort_entries(&mut dirs, &config);
for dir in dirs {
if number_of_locs > 1 {
println!("\n{}:", dir.to_string_lossy());
println!("\n{}:", dir.lossy_string);
}
enter_directory(&dir, &config);
}
@ -1094,22 +1117,22 @@ fn list(locs: Vec<String>, config: Config) -> i32 {
}
}
fn sort_entries(entries: &mut Vec<PathBuf>, config: &Config) {
fn sort_entries(entries: &mut Vec<PathData>, config: &Config) {
match config.sort {
Sort::Time => entries.sort_by_key(|k| {
Reverse(
get_metadata(k, false)
k.md.as_ref()
.ok()
.and_then(|md| get_system_time(&md, config))
.unwrap_or(UNIX_EPOCH),
)
}),
Sort::Size => {
entries.sort_by_key(|k| Reverse(get_metadata(k, false).map(|md| md.len()).unwrap_or(0)))
entries.sort_by_key(|k| Reverse(k.md.as_ref().map(|md| md.len()).unwrap_or(0)))
}
// The default sort in GNU ls is case insensitive
Sort::Name => entries.sort_by_key(|k| k.to_string_lossy().to_lowercase()),
Sort::Version => entries.sort_by(|a, b| version_cmp::version_cmp(a, b)),
Sort::Name => entries.sort_by_key(|k| k.file_name.to_lowercase()),
Sort::Version => entries.sort_by(|k, j| version_cmp::version_cmp(&k.p_buf, &j.p_buf)),
Sort::None => {}
}
@ -1143,32 +1166,57 @@ fn should_display(entry: &DirEntry, config: &Config) -> bool {
true
}
fn enter_directory(dir: &Path, config: &Config) {
let mut entries: Vec<_> = safe_unwrap!(fs::read_dir(dir).and_then(Iterator::collect));
entries.retain(|e| should_display(e, config));
let mut entries: Vec<_> = entries.iter().map(DirEntry::path).collect();
sort_entries(&mut entries, config);
if config.files == Files::All {
let mut display_entries = entries.clone();
display_entries.insert(0, dir.join(".."));
display_entries.insert(0, dir.join("."));
display_items(&display_entries, Some(dir), config, false);
fn enter_directory(dir: &PathData, config: &Config) {
let mut entries: Vec<_> = if config.files == Files::All {
vec![
PathData::new(dir.p_buf.join("."), config, false),
PathData::new(dir.p_buf.join(".."), config, false),
]
} else {
display_items(&entries, Some(dir), config, false);
}
vec![]
};
let mut temp: Vec<_> = safe_unwrap!(fs::read_dir(&dir.p_buf))
.map(|res| safe_unwrap!(res))
.filter(|e| should_display(e, config))
.map(|e| PathData::new(DirEntry::path(&e), config, false))
.collect();
sort_entries(&mut temp, config);
entries.append(&mut temp);
display_items(&entries, Some(&dir.p_buf), config);
if config.recursive {
for e in entries.iter().filter(|p| p.is_dir()) {
println!("\n{}:", e.to_string_lossy());
for e in entries
.iter()
.skip(if config.files == Files::All { 2 } else { 0 })
.filter(|p| p.md.as_ref().map(|md| md.is_dir()).unwrap_or(false))
{
println!("\n{}:", e.lossy_string);
enter_directory(&e, config);
}
}
}
fn get_metadata(entry: &Path, dereference: bool) -> std::io::Result<Metadata> {
fn get_metadata(entry: &Path, config: &Config, command_line: bool) -> std::io::Result<Metadata> {
let dereference = match &config.dereference {
Dereference::All => true,
Dereference::Args => command_line,
Dereference::DirArgs => {
if command_line {
if let Ok(md) = entry.metadata() {
md.is_dir()
} else {
false
}
} else {
false
}
}
Dereference::None => false,
};
if dereference {
entry.metadata().or_else(|_| entry.symlink_metadata())
} else {
@ -1176,8 +1224,8 @@ fn get_metadata(entry: &Path, dereference: bool) -> std::io::Result<Metadata> {
}
}
fn display_dir_entry_size(entry: &Path, config: &Config) -> (usize, usize) {
if let Ok(md) = get_metadata(entry, false) {
fn display_dir_entry_size(entry: &PathData, config: &Config) -> (usize, usize) {
if let Ok(md) = entry.md.as_ref() {
(
display_symlink_count(&md).len(),
display_file_size(&md, config).len(),
@ -1191,7 +1239,7 @@ fn pad_left(string: String, count: usize) -> String {
format!("{:>width$}", string, width = count)
}
fn display_items(items: &[PathBuf], strip: Option<&Path>, config: &Config, command_line: bool) {
fn display_items(items: &[PathData], strip: Option<&Path>, config: &Config) {
if config.format == Format::Long {
let (mut max_links, mut max_size) = (1, 1);
for item in items {
@ -1200,18 +1248,18 @@ fn display_items(items: &[PathBuf], strip: Option<&Path>, config: &Config, comma
max_size = size.max(max_size);
}
for item in items {
display_item_long(item, strip, max_links, max_size, config, command_line);
display_item_long(item, strip, max_links, max_size, config);
}
} else {
let names = items.iter().filter_map(|i| {
let md = get_metadata(i, false);
let md = i.md.as_ref();
match md {
Err(e) => {
let filename = get_file_name(i, strip);
let filename = get_file_name(&i.p_buf, strip);
show_error!("'{}': {}", filename, e);
None
}
Ok(md) => Some(display_file_name(&i, strip, &md, config)),
Ok(md) => Some(display_file_name(&i.p_buf, strip, &md, config)),
}
});
@ -1271,33 +1319,15 @@ fn display_grid(names: impl Iterator<Item = Cell>, width: u16, direction: Direct
use uucore::fs::display_permissions;
fn display_item_long(
item: &Path,
item: &PathData,
strip: Option<&Path>,
max_links: usize,
max_size: usize,
config: &Config,
command_line: bool,
) {
let dereference = match &config.dereference {
Dereference::All => true,
Dereference::Args => command_line,
Dereference::DirArgs => {
if command_line {
if let Ok(md) = item.metadata() {
md.is_dir()
} else {
false
}
} else {
false
}
}
Dereference::None => false,
};
let md = match get_metadata(item, dereference) {
let md = match &item.md {
Err(e) => {
let filename = get_file_name(&item, strip);
let filename = get_file_name(&item.p_buf, strip);
show_error!("{}: {}", filename, e);
return;
}
@ -1336,7 +1366,7 @@ fn display_item_long(
" {} {} {}",
pad_left(display_file_size(&md, config), max_size),
display_date(&md, config),
display_file_name(&item, strip, &md, config).contents,
display_file_name(&item.p_buf, strip, &md, config).contents,
);
}
@ -1348,14 +1378,50 @@ fn get_inode(metadata: &Metadata) -> String {
// Currently getpwuid is `linux` target only. If it's broken out into
// a posix-compliant attribute this can be updated...
#[cfg(unix)]
use std::sync::Mutex;
#[cfg(unix)]
use uucore::entries;
#[cfg(unix)]
fn cached_uid2usr(uid: u32) -> String {
lazy_static! {
static ref UID_CACHE: Mutex<HashMap<u32, String>> = Mutex::new(HashMap::new());
}
let mut uid_cache = UID_CACHE.lock().unwrap();
match uid_cache.get(&uid) {
Some(usr) => usr.clone(),
None => {
let usr = entries::uid2usr(uid).unwrap_or_else(|_| uid.to_string());
uid_cache.insert(uid, usr.clone());
usr
}
}
}
#[cfg(unix)]
fn display_uname(metadata: &Metadata, config: &Config) -> String {
if config.long.numeric_uid_gid {
metadata.uid().to_string()
} else {
entries::uid2usr(metadata.uid()).unwrap_or_else(|_| metadata.uid().to_string())
cached_uid2usr(metadata.uid())
}
}
#[cfg(unix)]
fn cached_gid2grp(gid: u32) -> String {
lazy_static! {
static ref GID_CACHE: Mutex<HashMap<u32, String>> = Mutex::new(HashMap::new());
}
let mut gid_cache = GID_CACHE.lock().unwrap();
match gid_cache.get(&gid) {
Some(grp) => grp.clone(),
None => {
let grp = entries::gid2grp(gid).unwrap_or_else(|_| gid.to_string());
gid_cache.insert(gid, grp.clone());
grp
}
}
}
@ -1364,7 +1430,7 @@ fn display_group(metadata: &Metadata, config: &Config) -> String {
if config.long.numeric_uid_gid {
metadata.gid().to_string()
} else {
entries::gid2grp(metadata.gid()).unwrap_or_else(|_| metadata.gid().to_string())
cached_gid2grp(metadata.gid())
}
}

View file

@ -103,6 +103,14 @@ fn test_ls_width() {
.succeeds()
.stdout_only("test-width-1\ntest-width-2\ntest-width-3\ntest-width-4\n");
}
for option in &["-w 1a", "-w=1a", "--width=1a", "--width 1a"] {
scene
.ucmd()
.args(&option.split(" ").collect::<Vec<_>>())
.fails()
.stderr_only("ls: error: invalid line width: 1a");
}
}
#[test]