2021-09-18 17:02:59 +00:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# SPDX-License-Identifier: MIT
|
|
|
|
import sys, pathlib, time
|
|
|
|
sys.path.append(str(pathlib.Path(__file__).resolve().parents[1]))
|
|
|
|
|
|
|
|
from m1n1.setup import *
|
|
|
|
from m1n1 import asm
|
|
|
|
|
|
|
|
# Start the secondary CPUs (presumably required before the smp_call*()
# helpers used below can target them).
p.smp_start_secondaries()

# Frequency of the system counter, read from CNTFRQ_EL0; used to convert
# CNTPCT_EL0 tick deltas into seconds.
tfreq = u.mrs(CNTFRQ_EL0)

# CPU index used for measurements, one entry per cluster.
TEST_CPUS = [1, 4]

# Base address of the register block of each cluster, indexed like TEST_CPUS.
CREG = [0x210e00000, 0x211e00000]

# Offset of the P-state control register inside a cluster register block.
CLUSTER_PSTATE = 0x20020

# Highest P-state index exercised on each cluster.
MAX_PSTATE = [5, 15]

# e-core pstates
# 600 972 1332 1704 2064

# p-core pstates
# 600 828 1056 1284 1500 1728 1956 2184 2388 2592 2772 2988 3096 3144 3204

# NOTE(review): the two lists above look like MHz values for P-states 1..N
# of each cluster -- confirm against a run of this script.

# Target-side scratch memory that receives the assembled helper routines.
code = u.malloc(0x1000)
|
|
|
|
|
|
|
|
# Helper routines that run on the target.  Arguments arrive in x0/x1 and the
# result is returned in x0, matching how they are invoked further down.
#
#   bench(x0 = iterations)
#       Counts x0 down to zero in a two-instruction loop and returns the
#       number of CNTPCT_EL0 ticks the loop took.  x0 must be non-zero: the
#       decrement happens before the test, so 0 would wrap around.
#
#   signal_and_write(x0 = address, x1 = value)
#       Executes SEV, spins until CNTPCT_EL0 reads exactly 0x800 ticks later
#       than on entry, stores x1 to [x0] and returns the counter value that
#       ended the spin.
#
#   timelog(x0 = buffer, x1 = entries)
#       Sets bit 0 of SYS_IMP_APL_PMCR0, writes all-ones to s3_1_c15_c1_0,
#       then sleeps in WFE.  Once woken it stores `entries` pairs of
#       (CNTPCT_EL0, s3_2_c15_c0_0) into the buffer, 16 bytes per pair, with
#       a 0x40-iteration delay loop between samples.
#       NOTE(review): s3_1_c15_c1_0 / s3_2_c15_c0_0 are presumably the PMU
#       control and cycle-counter registers -- the second value is consumed
#       as a cycle count by bench_latency(); confirm the register names.
#
# The assembly text must contain nothing but assembler input: anything else
# inside the string literal is handed to the assembler verbatim.
util = asm.ARMAsm(f"""
bench:
    mrs x1, CNTPCT_EL0
1:
    sub x0, x0, #1
    cbnz x0, 1b

    mrs x2, CNTPCT_EL0
    sub x0, x2, x1
    ret

signal_and_write:
    sev
    mrs x2, CNTPCT_EL0
    add x2, x2, #0x800
1:
    mrs x3, CNTPCT_EL0
    sub x4, x3, x2
    cbnz x4, 1b
    str x1, [x0]
    mov x0, x3
    ret

timelog:
    mrs x2, s3_1_c15_c0_0 /* SYS_IMP_APL_PMCR0 */
    orr x2, x2, #1
    msr s3_1_c15_c0_0, x2
    mov x2, #0xffffffffffffffff
    msr s3_1_c15_c1_0, x2
    isb
    wfe
1:
    mrs x2, CNTPCT_EL0
    mrs x3, s3_2_c15_c0_0
    isb
    stp x2, x3, [x0], #16
    mov x4, #0x40
2:
    sub x4, x4, #1
    cbnz x4, 2b
    sub x1, x1, #1
    cbnz x1, 1b

    ret
""", code)

# Upload the machine code, then run data-cache and instruction-cache
# maintenance over the same range (dc_cvau / ic_ivau, presumably the DC CVAU
# and IC IVAU operations) so the CPUs fetch the freshly written instructions.
iface.writemem(code, util.data)
p.dc_cvau(code, len(util.data))
p.ic_ivau(code, len(util.data))
|
|
|
|
|
|
|
|
def bench_cpu(idx, loops=10000000):
    """Run the ``bench`` countdown loop on CPU ``idx`` and return its rate.

    CPU 0 is driven through ``p.call``; every other CPU goes through
    ``p.smp_call_sync``.  The routine returns the elapsed system-counter
    ticks, which are converted to seconds with ``tfreq``.

    Returns the number of loop iterations per microsecond.  The callers print
    this as MHz, which assumes the loop retires one iteration per clock
    cycle.  Returns 0 when no time was measured.
    """
    if idx == 0:
        ticks = p.call(util.bench, loops)
    else:
        ticks = p.smp_call_sync(idx, util.bench, loops)

    elapsed = ticks / tfreq
    if elapsed == 0:
        # Avoid dividing by zero when the counter did not advance.
        return 0
    return (loops / elapsed) / 1000000
|
|
|
|
|
|
|
|
def set_pstate(cluster, pstate):
    """Request P-state ``pstate`` on ``cluster`` via its P-state register.

    The P-state index is placed in bits 3:0 and again in bits 15:12, with
    bit 25 set alongside.  0xf00f is passed as the mask of the two index
    fields (presumably the bits ``mask64`` clears before OR-ing in the new
    value).
    NOTE(review): the meaning of bit 25 is not evident from this file.
    """
    reg = CREG[cluster] + CLUSTER_PSTATE
    value = (1 << 25) | pstate | (pstate << 12)
    p.mask64(reg, 0xf00f, value)
|
|
|
|
|
|
|
|
# Blank line separating earlier output from the results below.
print()

# Number of (timestamp, cycle count) samples recorded per measurement.
LOG_ITERS = 10_000

# Target-side buffer for the samples: two 64-bit values (16 bytes) each.
logbuf = u.malloc(16 * LOG_ITERS)
|
|
|
|
|
|
|
|
def _windowed_mhz(log, i, window):
    """Return the mean clock in MHz between samples ``i - window`` and ``i + window``."""
    ts0, cyc0 = log[i - window]
    ts1, cyc1 = log[i + window]
    return (cyc1 - cyc0) / (ts1 - ts0) * tfreq / 1000000


def bench_latency(cluster, cpu, from_pstate, to_pstate, verbose=False):
    """Measure how long ``cluster`` takes to switch from one P-state to another.

    ``cpu`` (a CPU of that cluster) records ``LOG_ITERS`` samples of
    (system counter, cycle counter) with the ``timelog`` routine while the
    CPU behind ``p.call`` writes the new P-state at a known timestamp using
    ``signal_and_write``.  The log is then analysed offline:

    * the frequency before the write and over the second half of the
      post-write log are estimated from the cycle/tick ratio;
    * the transition is the first sample after the write at which the
      frequency averaged over +/- 32 samples crosses the harmonic mean of
      those two frequencies;
    * the "blip" is the largest gap between consecutive samples, minus the
      smaller of the average gaps before the write and at the end of the
      log, i.e. how long execution stalled beyond the normal sample period.

    Args:
        cluster: index into ``CREG`` of the cluster to reprogram.
        cpu: CPU index that runs the logger.
        from_pstate: P-state to settle at before the measurement.
        to_pstate: P-state written during the measurement.
        verbose: print the trigger time, the estimated frequencies and a
            dump of the samples around the detected transition.

    Returns:
        ``(latency_ns, blip_ns)``: time from the P-state write to the
        detected transition, and the execution stall, both in nanoseconds.

    Raises:
        RuntimeError: if the log does not bracket the write, the cycle
            counter did not advance, or no transition could be found.
    """
    what = f"cluster {cluster} P-state {from_pstate}->{to_pstate}"
    pstate_reg = CREG[cluster] + CLUSTER_PSTATE

    # Settle at the starting P-state; the benchmark result is discarded.
    set_pstate(cluster, from_pstate)
    bench_cpu(cpu)

    # Start the logger (it waits in WFE), then wake it with SEV and write the
    # new P-state.  tval is the system-counter value at the time of the write.
    p.smp_call(cpu, util.timelog, logbuf, LOG_ITERS)
    psreg = (p.read64(pstate_reg) & ~0xf00f) | (1<<25) | to_pstate | (to_pstate << 12)
    tval = p.call(util.signal_and_write, pstate_reg, psreg)
    p.smp_wait(cpu)

    logdata = iface.readmem(logbuf, LOG_ITERS * 16)
    log = [struct.unpack_from("<QQ", logdata, i * 16) for i in range(LOG_ITERS)]
    n = len(log)

    # The first `off` samples are ignored (presumably start-up noise).
    off = 256
    window = 32

    ts_0, cyc_0 = log[off]
    ts_e, cyc_e = log[-1]
    f_init = f_end = None
    tidx = dts_init = None
    lts, lcyc = ts_0, cyc_0

    inc = to_pstate > from_pstate

    blip = 0
    cnt = dts_sum = 0
    for i in range(off, n):
        ts, cyc = log[i]
        dts = ts - lts

        cnt += 1
        dts_sum += dts
        blip = max(blip, dts)

        if f_init is None and ts > tval:
            # First sample after the write: everything before it ran at the
            # initial frequency.
            if lts == ts_0:
                raise RuntimeError(f"{what}: P-state write precedes the analysed samples")
            tidx = i
            f_init = (lcyc - cyc_0) / (lts - ts_0) * tfreq / 1000000
            dts_init = dts_sum / cnt
        if f_end is None and ts > (tval + ts_e) / 2:
            # Halfway between the write and the end of the log: the rest is
            # taken as running at the final frequency.
            if ts == ts_e:
                raise RuntimeError(f"{what}: no samples left to estimate the final frequency")
            f_end = (cyc_e - cyc) / (ts_e - ts) * tfreq / 1000000
            # Restart the gap average so it covers only the tail of the log.
            cnt = dts_sum = 0

        lts, lcyc = ts, cyc

    if f_init is None or f_end is None:
        raise RuntimeError(f"{what}: trigger timestamp {tval} is not covered by the log")
    if f_init <= 0 or f_end <= 0:
        raise RuntimeError(f"{what}: cycle counter did not advance")

    dts_end = dts_sum / cnt

    if verbose:
        print(f"Triggered at {tval}")

    # Harmonic mean of the initial and final frequencies.
    thresh = 2 / (1 / f_init + 1 / f_end)

    tts = None
    for i in range(tidx, n - window - 1):
        f = _windowed_mhz(log, i, window)
        if (f > thresh) if inc else (f < thresh):
            tts = log[i][0]
            tidx = i
            if verbose:
                print(f"Frequency transition at #{i} {tts}")
            break

    if verbose:
        print(f"Initial frequency: {f_init:.2f}")
        print(f"Final frequency: {f_end:.2f}")
        print(f"Threshold: {thresh:.2f}")

        # Clamp both ends so the +/- window lookups stay inside the log.
        first = max(window, tidx - 10 * window)
        last = min(n - window, tidx + 10 * window)
        for i in range(first, last):
            pts, pcyc = log[i - 1]
            ts, cyc = log[i]
            f = _windowed_mhz(log, i, window)
            print(f"{i}: {ts}: {cyc} ({ts-pts}: {cyc-pcyc}): {f:.2f}")

    if tts is None:
        raise RuntimeError(f"{what}: no frequency transition found in the log")

    blip -= min(dts_init, dts_end)

    return (tts - tval) / tfreq * 1000000000, blip / tfreq * 1000000000
|
|
|
|
|
|
|
|
# For every cluster: measure the clock at each P-state, then print a matrix of
# switch latencies (in ns) between all pairs of P-states from 1 upwards.
for cluster, _ in enumerate(CREG):
    cpu = TEST_CPUS[cluster]
    last_pstate = MAX_PSTATE[cluster]
    table_pstates = range(1, last_pstate + 1)

    print(f"#### Cluster {cluster} ####")
    print(" P-States:")
    print(" ", end="")

    # Measured frequency of every P-state, including P-state 0.
    freqs = []
    for pstate in range(last_pstate + 1):
        set_pstate(cluster, pstate)
        freq = int(round(bench_cpu(cpu)))
        freqs.append(freq)
        print(f"{pstate}:{freq}MHz", end=" ")
    print()
    print()

    # Column headers: destination frequencies, followed by a separator row.
    header = "".join(f" {freqs[to_pstate]:7d} |" for to_pstate in table_pstates)
    print(" To-> |" + header)
    print(" From |" + "---------+" * len(table_pstates))

    maxblip = 0

    # One row per source P-state; cells are printed as they are measured.
    for from_pstate in table_pstates:
        print(f" {freqs[from_pstate]:4d} |", end="")
        for to_pstate in table_pstates:
            if from_pstate != to_pstate:
                lat, blip = bench_latency(cluster, cpu, from_pstate, to_pstate)
                print(f" {lat:7.0f} |", end="")
                maxblip = max(maxblip, blip)
            else:
                # No transition to measure on the diagonal.
                print(" ******* |", end="")
        print()

    print()
    print(f"Maximum execution latency spike: {maxblip:.0f} ns")
    print()

print()
|
|
|
|
|
|
|
|
#bench_latency(1, TEST_CPUS[1], 15, 14, True)
|
|
|
|
|
|
|
|
|