Auto merge of #122059 - nyurik:with-as-const-str, r=cuviper

Optimize write with as_const_str for shorter code

Following up on #121001

Apparently this code generates significant code block for each call to `write()` with non-simple formatting string - approx 100 lines of assembly code, possibly due to `dyn` (?).  See generated assembly code [here](https://github.com/nyurik/rust-optimize-format-str/compare/before-changes..with-my-change#diff-6b404e954c692d8cdc8c452d819a216aa5dcf40522b5944639e9ad947279a477):

<details><summary>Details</summary>
<p>

This is the inlining of `write!(buffer, "Iteration {value} was written")`

```asm
core::fmt::Write::write_fmt:
		// /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 194
		fn write_fmt(&mut self, args: Arguments<'_>) -> Result {
	push r15
	push r14
	push r13
	push r12
	push rbx
	mov rdx, rsi
		// /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 427
		match (self.pieces, self.args) {
	mov rcx, qword ptr [rsi + 8]
	mov rax, qword ptr [rsi + 24]
		// /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 428
		([], []) => Some(""),
	cmp rcx, 1
	je .LBB0_8
	test rcx, rcx
	jne .LBB0_9
	test rax, rax
	jne .LBB0_9
		// /home/nyurik/dev/rust/rust/library/alloc/src/vec/mod.rs : 911
		self.buf.reserve(self.len, additional);
	lea r12, [rdi + 16]
	lea rsi, [rip + .L__unnamed_2]
	xor ebx, ebx
.LBB0_6:
	mov r14, qword ptr [r12]
	jmp .LBB0_7
.LBB0_8:
		// /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 429
		([s], []) => Some(s),
	test rax, rax
	je .LBB0_4
.LBB0_9:
		// /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 1108
		if let Some(s) = args.as_str() { output.write_str(s) } else { write_internal(output, args) }
	lea rsi, [rip + .L__unnamed_1]
	pop rbx
	pop r12
	pop r13
	pop r14
	pop r15
	jmp qword ptr [rip + core::fmt::write_internal@GOTPCREL]
.LBB0_4:
	mov rax, qword ptr [rdx]
		// /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 429
		([s], []) => Some(s),
	mov rsi, qword ptr [rax]
	mov rbx, qword ptr [rax + 8]
		// /home/nyurik/dev/rust/rust/library/alloc/src/raw_vec.rs : 248
		if T::IS_ZST { usize::MAX } else { self.cap.0 }
	mov rax, qword ptr [rdi]
		// /home/nyurik/dev/rust/rust/library/alloc/src/vec/mod.rs : 911
		self.buf.reserve(self.len, additional);
	mov r14, qword ptr [rdi + 16]
		// /home/nyurik/dev/rust/rust/library/core/src/num/mod.rs : 1281
		uint_impl! {
	sub rax, r14
		// /home/nyurik/dev/rust/rust/library/alloc/src/raw_vec.rs : 392
		additional > self.capacity().wrapping_sub(len)
	cmp rax, rbx
		// /home/nyurik/dev/rust/rust/library/alloc/src/raw_vec.rs : 309
		if self.needs_to_grow(len, additional) {
	jb .LBB0_5
.LBB0_7:
	mov rax, qword ptr [rdi + 8]
		// /home/nyurik/dev/rust/rust/library/core/src/ptr/mut_ptr.rs : 1046
		unsafe { intrinsics::offset(self, count) }
	add rax, r14
	mov r15, rdi
		// /home/nyurik/dev/rust/rust/library/core/src/intrinsics.rs : 2922
		copy_nonoverlapping(src, dst, count)
	mov rdi, rax
	mov rdx, rbx
	call qword ptr [rip + memcpy@GOTPCREL]
		// /home/nyurik/dev/rust/rust/library/alloc/src/vec/mod.rs : 2040
		self.len += count;
	add r14, rbx
	mov qword ptr [r15 + 16], r14
		// /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 216
		}
	xor eax, eax
	pop rbx
	pop r12
	pop r13
	pop r14
	pop r15
	ret
.LBB0_5:
		// /home/nyurik/dev/rust/rust/library/alloc/src/vec/mod.rs : 911
		self.buf.reserve(self.len, additional);
	lea r12, [rdi + 16]
	mov r15, rdi
	mov r13, rsi
		// /home/nyurik/dev/rust/rust/library/alloc/src/raw_vec.rs : 310
		do_reserve_and_handle(self, len, additional);
	mov rsi, r14
	mov rdx, rbx
	call alloc::raw_vec::RawVec<T,A>::reserve::do_reserve_and_handle
	mov rsi, r13
	mov rdi, r15
	jmp .LBB0_6
```

</p>
</details>

```rust
#[inline]
pub fn write(output: &mut dyn Write, args: Arguments<'_>) -> Result {
    if let Some(s) = args.as_str() { output.write_str(s) } else { write_internal(output, args) }
}
```

So, this brings back the older experiment - where I used `if core::intrinsics::is_val_statically_known(s.is_some()) { s } else { None }` helper function, and called it in multiple places that used `write`.  This is not as optimal because now every user of `write` must do this logic, but at least it results in significantly smaller assembly code for the formatting case, and results in identical code as now for the "simple" (no formatting) case. See [assembly comparison](https://github.com/nyurik/rust-optimize-format-str/compare/with-my-change..with-as-const-str#diff-6b404e954c692d8cdc8c452d819a216aa5dcf40522b5944639e9ad947279a477) of what is now with what this change brings (focus only on `fmt/intel-lib.txt` and `str/intel-lib.txt` files).

```rust
               if let Some(s) = args.as_const_str() {
                    self.write_str(s)
                } else {
                    write(self, args)
                }
```
This commit is contained in:
bors 2024-03-08 04:15:17 +00:00
commit 31a98cb57d

Diff content is not available