bevy/benches
James Liu ec8c8fbc8a Remove unnecesary branches/panics from Query accesses (#6461)
# Objective
Supercedes #6452. Upon inspection of the [generated assembly](https://gist.github.com/james7132/c2740c6941b80d7912f1e8888e223cbb#file-original-s) of a [simple Bevy binary](https://gist.github.com/james7132/c2740c6941b80d7912f1e8888e223cbb#file-source-rs) compiled with `cargo rustc --release -- --emit asm`, it's apparent that there are multiple unnecessary branches in the generated assembly:

```assembly
.LBB5_5:
	cmpq	%r10, %r11
	je	.LBB5_15
	movq	(%r11), %rcx
	movq	328(%r15), %rdx
	cmpq	%rdx, %rcx
	jae	.LBB5_14
	movq	312(%r15), %rdi
	leaq	(%rcx,%rcx,2), %rcx
	shlq	$5, %rcx
	movq	336(%r12), %rdx
	movq	64(%rdi,%rcx), %rax
	cmpq	%rdx, %rax
	jbe	.LBB5_4
	leaq	(%rdi,%rcx), %rsi
	movq	48(%rsi), %rbp
	shlq	$4, %rdx
	cmpq	$0, (%rbp,%rdx)
	je	.LBB5_4
	movq	344(%r12), %rbx
	cmpq	%rbx, %rax
	jbe	.LBB5_4
	shlq	$4, %rbx
	cmpq	$0, (%rbp,%rbx)
	je	.LBB5_4
	addq	$8, %r11
	movq	88(%rdi,%rcx), %rcx
	testq	%rcx, %rcx
	je	.LBB5_5
	movq	(%rsi), %rax
	movq	8(%rbp,%rdx), %rdx
	leaq	(%rdx,%rdx,4), %rdi
	shlq	$4, %rdi
	movq	32(%rax,%rdi), %rdx
	movq	56(%rax,%rdi), %r8
	movq	8(%rbp,%rbx), %rbp
	leaq	(%rbp,%rbp,4), %rbp
	shlq	$4, %rbp
	movq	32(%rax,%rbp), %r9
	xorl	%ebp, %ebp
	jmp	.LBB5_13
	.p2align	4, 0x90
```

Almost every one of the instructions starting with `j` is a potential branch, which can significantly slow down accesses. Of these, two labels are both common and never used:

```asm
.LBB5_14:
	leaq	__unnamed_2(%rip), %r8
	callq	_ZN4core9panicking18panic_bounds_check17h70367088e72af65aE
	ud2
.LBB5_4:
	callq	_ZN8bevy_ecs5query25debug_checked_unreachable17h0855ff520ceaea77E
	ud2
	.seh_endproc
```

These correpsond to subprocedure calls to panicking due to out of bounds from indexing `Tables` and `debug_checked_unreadable`. Both of which should be inlined and optimized out, but are not.

## Solution
Make `debug_checked_unreachable` a macro to forcibly inline either `unreachable!()` in debug builds, and `std::hint::unreachable_unchecked()` in release mode. Replace the `Tables` and `Archetype` index access with `get(id).unwrap_or_else(|| debug_checked_unreachable!())` to assume that the table or archetype provided exists.

This has no external breaking change of any kind.

The equivalent section of code with these changes removes most of the conditional jump instructions:

```asm
.LBB5_5:
	movss	(%rbx,%rbp,4), %xmm0
	movl	%r14d, 4(%r8,%rbp,8)
	addss	(%rdi,%rbp,4), %xmm0
	movss	%xmm0, (%rdi,%rbp,4)
	incq	%rbp
.LBB5_1:
	cmpq	%rdx, %rbp
	jne	.LBB5_5
	.p2align	4, 0x90
.LBB5_2:
	cmpq	%rcx, %rax
	je	.LBB5_6
	movq	(%rax), %rdx
	addq	$8, %rax
	movq	312(%rsi), %rbp
	leaq	(%rdx,%rdx,2), %rbx
	shlq	$5, %rbx
	movq	88(%rbp,%rbx), %rdx
	testq	%rdx, %rdx
	je	.LBB5_2
	leaq	(%rbx,%rbp), %r8
	movq	336(%r15), %rdi
	movq	344(%r15), %r9
	movq	48(%rbp,%rbx), %r10
	shlq	$4, %rdi
	movq	(%r8), %rbx
	movq	8(%r10,%rdi), %rdi
	leaq	(%rdi,%rdi,4), %rbp
	shlq	$4, %rbp
	movq	32(%rbx,%rbp), %rdi
	movq	56(%rbx,%rbp), %r8
	shlq	$4, %r9
	movq	8(%r10,%r9), %rbp
	leaq	(%rbp,%rbp,4), %rbp
	shlq	$4, %rbp
	movq	32(%rbx,%rbp), %rbx
	xorl	%ebp, %ebp
	jmp	.LBB5_5
.LBB5_6:
	addq	$40, %rsp
	popq	%rbx
	popq	%rbp
	popq	%rdi
	popq	%rsi
	popq	%r14
	popq	%r15
	retq
	.seh_endproc

```

## Performance

Microbenchmarks results:

<details>

```
group                                                    main                                     no-panic-query
-----                                                    ----                                     --------------
busy_systems/01x_entities_03_systems                     1.20     42.4±2.66µs        ? ?/sec      1.00     35.3±1.68µs        ? ?/sec
busy_systems/01x_entities_06_systems                     1.32     83.8±3.50µs        ? ?/sec      1.00     63.6±1.72µs        ? ?/sec
busy_systems/01x_entities_09_systems                     1.15    113.3±8.90µs        ? ?/sec      1.00     98.2±6.15µs        ? ?/sec
busy_systems/01x_entities_12_systems                     1.27   160.8±32.44µs        ? ?/sec      1.00    126.6±4.70µs        ? ?/sec
busy_systems/01x_entities_15_systems                     1.12    179.6±3.71µs        ? ?/sec      1.00   160.3±11.03µs        ? ?/sec
busy_systems/02x_entities_03_systems                     1.18     76.8±3.14µs        ? ?/sec      1.00     65.2±3.17µs        ? ?/sec
busy_systems/02x_entities_06_systems                     1.16    144.6±6.10µs        ? ?/sec      1.00    124.5±5.14µs        ? ?/sec
busy_systems/02x_entities_09_systems                     1.19    215.3±9.18µs        ? ?/sec      1.00    181.5±5.67µs        ? ?/sec
busy_systems/02x_entities_12_systems                     1.20    266.7±8.33µs        ? ?/sec      1.00    222.0±9.53µs        ? ?/sec
busy_systems/02x_entities_15_systems                     1.23   338.8±10.53µs        ? ?/sec      1.00    276.3±6.94µs        ? ?/sec
busy_systems/03x_entities_03_systems                     1.43    113.5±5.06µs        ? ?/sec      1.00     79.6±1.49µs        ? ?/sec
busy_systems/03x_entities_06_systems                     1.38   217.3±12.67µs        ? ?/sec      1.00    157.5±3.07µs        ? ?/sec
busy_systems/03x_entities_09_systems                     1.23   308.8±24.75µs        ? ?/sec      1.00    251.6±8.93µs        ? ?/sec
busy_systems/03x_entities_12_systems                     1.05   347.7±12.43µs        ? ?/sec      1.00   330.6±11.43µs        ? ?/sec
busy_systems/03x_entities_15_systems                     1.13   455.5±13.88µs        ? ?/sec      1.00   401.7±17.29µs        ? ?/sec
busy_systems/04x_entities_03_systems                     1.24    144.7±5.89µs        ? ?/sec      1.00    116.9±6.29µs        ? ?/sec
busy_systems/04x_entities_06_systems                     1.24   282.8±21.40µs        ? ?/sec      1.00   228.6±21.31µs        ? ?/sec
busy_systems/04x_entities_09_systems                     1.35   431.8±14.10µs        ? ?/sec      1.00    319.6±9.83µs        ? ?/sec
busy_systems/04x_entities_12_systems                     1.16   493.8±22.87µs        ? ?/sec      1.00   424.9±15.24µs        ? ?/sec
busy_systems/04x_entities_15_systems                     1.10   587.5±23.25µs        ? ?/sec      1.00   531.7±16.32µs        ? ?/sec
busy_systems/05x_entities_03_systems                     1.14    148.2±9.61µs        ? ?/sec      1.00    129.5±4.32µs        ? ?/sec
busy_systems/05x_entities_06_systems                     1.31   359.7±17.46µs        ? ?/sec      1.00   273.6±10.55µs        ? ?/sec
busy_systems/05x_entities_09_systems                     1.22   473.5±23.11µs        ? ?/sec      1.00   389.3±13.62µs        ? ?/sec
busy_systems/05x_entities_12_systems                     1.05   562.9±20.76µs        ? ?/sec      1.00   536.5±24.35µs        ? ?/sec
busy_systems/05x_entities_15_systems                     1.23   818.5±28.70µs        ? ?/sec      1.00   666.6±45.87µs        ? ?/sec
contrived/01x_entities_03_systems                        1.27     27.5±0.49µs        ? ?/sec      1.00     21.6±1.71µs        ? ?/sec
contrived/01x_entities_06_systems                        1.22     49.9±1.18µs        ? ?/sec      1.00     40.7±2.62µs        ? ?/sec
contrived/01x_entities_09_systems                        1.30     72.3±2.39µs        ? ?/sec      1.00     55.4±2.60µs        ? ?/sec
contrived/01x_entities_12_systems                        1.28     94.3±9.44µs        ? ?/sec      1.00     73.7±3.62µs        ? ?/sec
contrived/01x_entities_15_systems                        1.25    118.0±2.43µs        ? ?/sec      1.00     94.1±3.99µs        ? ?/sec
contrived/02x_entities_03_systems                        1.23     41.6±1.71µs        ? ?/sec      1.00     33.7±2.30µs        ? ?/sec
contrived/02x_entities_06_systems                        1.19     78.6±2.63µs        ? ?/sec      1.00     65.9±2.35µs        ? ?/sec
contrived/02x_entities_09_systems                        1.28    113.6±3.60µs        ? ?/sec      1.00     88.6±3.60µs        ? ?/sec
contrived/02x_entities_12_systems                        1.20    146.4±5.75µs        ? ?/sec      1.00    121.7±3.35µs        ? ?/sec
contrived/02x_entities_15_systems                        1.23    178.5±4.86µs        ? ?/sec      1.00    145.7±4.00µs        ? ?/sec
contrived/03x_entities_03_systems                        1.42     58.3±2.77µs        ? ?/sec      1.00     41.1±1.54µs        ? ?/sec
contrived/03x_entities_06_systems                        1.32    108.5±7.30µs        ? ?/sec      1.00     82.4±4.86µs        ? ?/sec
contrived/03x_entities_09_systems                        1.23    153.7±4.61µs        ? ?/sec      1.00    125.0±4.76µs        ? ?/sec
contrived/03x_entities_12_systems                        1.18    197.5±5.12µs        ? ?/sec      1.00    166.8±8.14µs        ? ?/sec
contrived/03x_entities_15_systems                        1.23    238.8±6.38µs        ? ?/sec      1.00    194.6±4.55µs        ? ?/sec
contrived/04x_entities_03_systems                        1.34     66.4±3.42µs        ? ?/sec      1.00     49.5±1.98µs        ? ?/sec
contrived/04x_entities_06_systems                        1.27    134.3±4.86µs        ? ?/sec      1.00    105.8±3.58µs        ? ?/sec
contrived/04x_entities_09_systems                        1.26    193.2±3.83µs        ? ?/sec      1.00    153.0±5.60µs        ? ?/sec
contrived/04x_entities_12_systems                        1.16    237.1±5.78µs        ? ?/sec      1.00   204.9±18.77µs        ? ?/sec
contrived/04x_entities_15_systems                        1.17    289.2±4.76µs        ? ?/sec      1.00    246.3±8.57µs        ? ?/sec
contrived/05x_entities_03_systems                        1.26     80.4±2.90µs        ? ?/sec      1.00     63.7±3.07µs        ? ?/sec
contrived/05x_entities_06_systems                        1.27   161.6±13.47µs        ? ?/sec      1.00    127.2±5.59µs        ? ?/sec
contrived/05x_entities_09_systems                        1.22    228.0±7.76µs        ? ?/sec      1.00    186.2±7.68µs        ? ?/sec
contrived/05x_entities_12_systems                        1.20    289.5±6.21µs        ? ?/sec      1.00    241.8±7.52µs        ? ?/sec
contrived/05x_entities_15_systems                        1.18   357.3±11.24µs        ? ?/sec      1.00    302.7±7.21µs        ? ?/sec
heavy_compute/base                                       1.01    302.4±3.52µs        ? ?/sec      1.00    300.2±3.40µs        ? ?/sec
iter_fragmented/base                                     1.00    348.1±7.51ns        ? ?/sec      1.01    351.9±8.32ns        ? ?/sec
iter_fragmented/foreach                                  1.03   239.8±23.78ns        ? ?/sec      1.00   233.8±18.12ns        ? ?/sec
iter_fragmented/foreach_wide                             1.00      3.9±0.13µs        ? ?/sec      1.02      4.0±0.22µs        ? ?/sec
iter_fragmented/wide                                     1.18      4.6±0.15µs        ? ?/sec      1.00      3.9±0.10µs        ? ?/sec
iter_fragmented_sparse/base                              1.02      8.1±0.15ns        ? ?/sec      1.00      7.9±0.56ns        ? ?/sec
iter_fragmented_sparse/foreach                           1.00      7.8±0.22ns        ? ?/sec      1.01      7.9±0.62ns        ? ?/sec
iter_fragmented_sparse/foreach_wide                      1.00     37.2±1.17ns        ? ?/sec      1.10     40.9±0.95ns        ? ?/sec
iter_fragmented_sparse/wide                              1.09     48.4±2.13ns        ? ?/sec      1.00    44.5±18.34ns        ? ?/sec
iter_simple/base                                         1.02      8.4±0.10µs        ? ?/sec      1.00      8.2±0.14µs        ? ?/sec
iter_simple/foreach                                      1.01      8.3±0.07µs        ? ?/sec      1.00      8.2±0.09µs        ? ?/sec
iter_simple/foreach_sparse_set                           1.00     25.3±0.32µs        ? ?/sec      1.02     25.7±0.42µs        ? ?/sec
iter_simple/foreach_wide                                 1.03     41.1±0.94µs        ? ?/sec      1.00     39.9±0.41µs        ? ?/sec
iter_simple/foreach_wide_sparse_set                      1.05    123.6±2.05µs        ? ?/sec      1.00    118.1±2.78µs        ? ?/sec
iter_simple/sparse_set                                   1.14     30.5±1.40µs        ? ?/sec      1.00     26.9±0.64µs        ? ?/sec
iter_simple/system                                       1.01      8.4±0.25µs        ? ?/sec      1.00      8.4±0.11µs        ? ?/sec
iter_simple/wide                                         1.18     48.2±0.62µs        ? ?/sec      1.00     40.7±0.38µs        ? ?/sec
iter_simple/wide_sparse_set                              1.12   140.8±21.56µs        ? ?/sec      1.00    126.0±2.30µs        ? ?/sec
query_get/50000_entities_sparse                          1.17    378.6±7.60µs        ? ?/sec      1.00   324.1±23.17µs        ? ?/sec
query_get/50000_entities_table                           1.08   330.9±10.90µs        ? ?/sec      1.00    306.8±4.98µs        ? ?/sec
query_get_component/50000_entities_sparse                1.00   976.7±19.55µs        ? ?/sec      1.00   979.8±35.87µs        ? ?/sec
query_get_component/50000_entities_table                 1.00  1029.0±15.11µs        ? ?/sec      1.05  1080.0±59.18µs        ? ?/sec
query_get_component_simple/system                        1.13   839.7±14.18µs        ? ?/sec      1.00   742.8±10.72µs        ? ?/sec
query_get_component_simple/unchecked                     1.01   909.0±15.17µs        ? ?/sec      1.00   898.0±13.56µs        ? ?/sec
query_get_many_10/50000_calls_sparse                     1.04      5.5±0.54ms        ? ?/sec      1.00      5.3±0.67ms        ? ?/sec
query_get_many_10/50000_calls_table                      1.01      4.9±0.49ms        ? ?/sec      1.00      4.8±0.45ms        ? ?/sec
query_get_many_2/50000_calls_sparse                      1.28  848.4±210.89µs        ? ?/sec      1.00   664.8±47.69µs        ? ?/sec
query_get_many_2/50000_calls_table                       1.05   779.0±73.85µs        ? ?/sec      1.00   739.2±83.02µs        ? ?/sec
query_get_many_5/50000_calls_sparse                      1.05      2.4±0.37ms        ? ?/sec      1.00      2.3±0.33ms        ? ?/sec
query_get_many_5/50000_calls_table                       1.00  1939.9±75.22µs        ? ?/sec      1.04      2.0±0.19ms        ? ?/sec
run_criteria/yes_using_query/001_systems                 1.00      3.7±0.38µs        ? ?/sec      1.30      4.9±0.14µs        ? ?/sec
run_criteria/yes_using_query/006_systems                 1.00      8.9±0.40µs        ? ?/sec      1.17     10.3±0.57µs        ? ?/sec
run_criteria/yes_using_query/011_systems                 1.00     13.9±0.49µs        ? ?/sec      1.08     15.0±0.89µs        ? ?/sec
run_criteria/yes_using_query/016_systems                 1.00     18.8±0.74µs        ? ?/sec      1.00     18.8±1.43µs        ? ?/sec
run_criteria/yes_using_query/021_systems                 1.07     24.1±0.87µs        ? ?/sec      1.00     22.6±1.58µs        ? ?/sec
run_criteria/yes_using_query/026_systems                 1.04     27.9±0.62µs        ? ?/sec      1.00     26.8±1.71µs        ? ?/sec
run_criteria/yes_using_query/031_systems                 1.09     33.3±1.03µs        ? ?/sec      1.00     30.5±2.18µs        ? ?/sec
run_criteria/yes_using_query/036_systems                 1.14     38.7±0.80µs        ? ?/sec      1.00     33.9±1.75µs        ? ?/sec
run_criteria/yes_using_query/041_systems                 1.18     43.7±1.07µs        ? ?/sec      1.00     37.0±2.39µs        ? ?/sec
run_criteria/yes_using_query/046_systems                 1.14     47.6±1.16µs        ? ?/sec      1.00     41.9±2.09µs        ? ?/sec
run_criteria/yes_using_query/051_systems                 1.17     52.9±2.04µs        ? ?/sec      1.00     45.3±1.75µs        ? ?/sec
run_criteria/yes_using_query/056_systems                 1.25     59.2±2.38µs        ? ?/sec      1.00     47.2±2.01µs        ? ?/sec
run_criteria/yes_using_query/061_systems                 1.28    66.1±15.84µs        ? ?/sec      1.00     51.5±2.47µs        ? ?/sec
run_criteria/yes_using_query/066_systems                 1.28     70.2±2.57µs        ? ?/sec      1.00     54.7±2.58µs        ? ?/sec
run_criteria/yes_using_query/071_systems                 1.30     75.5±2.27µs        ? ?/sec      1.00     58.2±3.31µs        ? ?/sec
run_criteria/yes_using_query/076_systems                 1.26     81.5±2.66µs        ? ?/sec      1.00     64.5±3.13µs        ? ?/sec
run_criteria/yes_using_query/081_systems                 1.29     89.7±2.58µs        ? ?/sec      1.00     69.3±3.47µs        ? ?/sec
run_criteria/yes_using_query/086_systems                 1.33     95.6±3.39µs        ? ?/sec      1.00     71.8±3.48µs        ? ?/sec
run_criteria/yes_using_query/091_systems                 1.25    102.0±3.67µs        ? ?/sec      1.00     81.4±4.82µs        ? ?/sec
run_criteria/yes_using_query/096_systems                 1.33    111.7±3.29µs        ? ?/sec      1.00     83.8±4.15µs        ? ?/sec
run_criteria/yes_using_query/101_systems                 1.29   113.2±12.04µs        ? ?/sec      1.00     87.7±5.15µs        ? ?/sec
world_query_for_each/50000_entities_sparse               1.00     47.4±0.51µs        ? ?/sec      1.00     47.3±0.33µs        ? ?/sec
world_query_for_each/50000_entities_table                1.00     27.2±0.50µs        ? ?/sec      1.00     27.2±0.17µs        ? ?/sec
world_query_get/50000_entities_sparse_wide               1.09    210.5±1.78µs        ? ?/sec      1.00    192.5±2.61µs        ? ?/sec
world_query_get/50000_entities_table                     1.00    127.7±2.09µs        ? ?/sec      1.07    136.2±5.95µs        ? ?/sec
world_query_get/50000_entities_table_wide                1.00    209.8±2.37µs        ? ?/sec      1.15    240.6±2.04µs        ? ?/sec
world_query_iter/50000_entities_sparse                   1.00     54.2±0.36µs        ? ?/sec      1.01     54.7±0.61µs        ? ?/sec
world_query_iter/50000_entities_table                    1.00     27.2±0.31µs        ? ?/sec      1.00     27.3±0.64µs        ? ?/sec
```
</details>

NOTE: This PR includes a change to enable LTO on our benchmarks to get a "fully optimized" baseline for our benchmarks. Both the main and the current PR's results were with LTO enabled.
2022-11-04 06:04:55 +00:00
..
benches Speed up Query::get_many and add benchmarks (#6400) 2022-11-01 03:51:41 +00:00
Cargo.toml Remove unnecesary branches/panics from Query accesses (#6461) 2022-11-04 06:04:55 +00:00