#!/usr/bin/env python3
# SPDX-License-Identifier: MIT
"""Measure CPU cluster p-state frequencies and p-state switch latencies.

For every cluster in CREG the script first benchmarks each p-state on one
test CPU and prints the measured clock, then prints a from/to matrix of the
time (in ns) between writing a new p-state to the cluster register and the
test CPU actually running at the new frequency.
"""
import sys
import pathlib
import struct
import time
sys.path.append(str(pathlib.Path(__file__).resolve().parents[1]))

from m1n1.setup import *
from m1n1 import asm

p.smp_start_secondaries()

# System timer frequency; CNTPCT_EL0 timestamps are in these ticks.
tfreq = u.mrs(CNTFRQ_EL0)

# One CPU index per cluster; the benchmarks for cluster N run on TEST_CPUS[N].
TEST_CPUS = [1, 4]

# Per-cluster register block base addresses, indexed by cluster number.
CREG = [
    0x210e00000,
    0x211e00000,
]

# Offset of the p-state register inside a cluster register block.
CLUSTER_PSTATE = 0x20020

# Highest p-state index exercised on each cluster.
MAX_PSTATE = [5, 15]

# e-core pstates
# 600 972 1332 1704 2064

# p-core pstates
# 600 828 1056 1284 1500 1728 1956 2184 2388 2592 2772 2988 3096 3144 3204

code = u.malloc(0x1000)

# Assembly helpers that run on the target:
#   bench(x0=loops)          spin `loops` iterations, return elapsed CNTPCT
#                            ticks.
#   signal_and_write(x0=addr, x1=value)
#                            sev, spin until CNTPCT equals (now + 0x800),
#                            store `value` to `addr`, return that timestamp.
#   timelog(x0=buf, x1=count)
#                            set bit 0 of PMCR0, write all-ones to
#                            s3_1_c15_c1_0, wfe, then store `count` pairs of
#                            (CNTPCT_EL0, s3_2_c15_c0_0) to `buf`, 16 bytes
#                            each, with a 0x40-iteration delay loop between
#                            samples.  The Python side treats the second
#                            value as a cycle count (presumably a PMU cycle
#                            counter - confirm against the SoC docs).
util = asm.ARMAsm("""
bench:
    mrs x1, CNTPCT_EL0
1:
    sub x0, x0, #1
    cbnz x0, 1b

    mrs x2, CNTPCT_EL0
    sub x0, x2, x1
    ret

signal_and_write:
    sev
    mrs x2, CNTPCT_EL0
    add x2, x2, #0x800
1:
    mrs x3, CNTPCT_EL0
    sub x4, x3, x2
    cbnz x4, 1b
    str x1, [x0]
    mov x0, x3
    ret

timelog:
    mrs x2, s3_1_c15_c0_0 /* SYS_IMP_APL_PMCR0 */
    orr x2, x2, #1
    msr s3_1_c15_c0_0, x2
    mov x2, #0xffffffffffffffff
    msr s3_1_c15_c1_0, x2
    isb
    wfe
1:
    mrs x2, CNTPCT_EL0
    mrs x3, s3_2_c15_c0_0
    isb
    stp x2, x3, [x0], #16
    mov x4, #0x40
2:
    sub x4, x4, #1
    cbnz x4, 2b
    sub x1, x1, #1
    cbnz x1, 1b

    ret
""", code)

# Upload the code and make it visible to instruction fetch.
iface.writemem(code, util.data)
p.dc_cvau(code, len(util.data))
p.ic_ivau(code, len(util.data))


def bench_cpu(idx, loops=10000000):
    """Return the `bench` loop rate of CPU `idx` in millions of loops/second.

    CPU 0 runs the loop via a direct call; any other index goes through a
    synchronous SMP call. The callers print the result as the core clock in
    MHz (this assumes one loop iteration per cycle). Returns 0 when the timer
    did not advance during the run.
    """
    if idx == 0:
        ticks = p.call(util.bench, loops)
    else:
        ticks = p.smp_call_sync(idx, util.bench, loops)
    elapsed = ticks / tfreq
    if elapsed == 0:
        return 0
    return (loops / elapsed) / 1000000


def set_pstate(cluster, pstate):
    """Request `pstate` on `cluster` through its p-state register.

    Clears the 0xf00f field bits, then writes `pstate` to bits [3:0] and
    [15:12] and sets bit 25 - the same value bench_latency builds by hand.
    """
    p.mask64(CREG[cluster] + CLUSTER_PSTATE, 0xf00f,
             (1 << 25) | pstate | (pstate << 12))


print()

# Number of (timestamp, cycle count) samples captured per latency run.
LOG_ITERS = 10000
logbuf = u.malloc(LOG_ITERS * 16)


def bench_latency(cluster, cpu, from_pstate, to_pstate, verbose=False):
    """Measure how long a p-state switch takes to reach the CPU.

    `cpu` is parked in `timelog`, the p-state register of `cluster` is
    rewritten from `from_pstate` to `to_pstate` at a known timestamp, and the
    captured log is searched for the point where the observed frequency
    crosses the harmonic mean of the initial and final frequencies.

    Returns a tuple `(latency_ns, blip_ns)`:
      latency_ns  time from the register write to the detected transition
      blip_ns     longest gap between two consecutive samples, minus the
                  smaller of the average gaps before/after the switch

    Raises RuntimeError when the register write is not bracketed by the
    captured log or when no frequency transition can be found (the original
    failed with UnboundLocalError/TypeError in these cases).
    """
    set_pstate(cluster, from_pstate)
    bench_cpu(cpu)

    # Start the logger on the target CPU; it blocks in wfe until the sev
    # issued by signal_and_write.
    p.smp_call(cpu, util.timelog, logbuf, LOG_ITERS)
    reg = CREG[cluster] + CLUSTER_PSTATE
    psreg = ((p.read64(reg) & ~0xf00f) | (1 << 25)
             | to_pstate | (to_pstate << 12))
    tval = p.call(util.signal_and_write, reg, psreg)
    p.smp_wait(cpu)

    logdata = iface.readmem(logbuf, LOG_ITERS * 16)

    # NOTE(review): the source text between `struct.unpack("` and
    # `from_pstate` was lost before this review.  Everything from here down
    # to the `inc = ...` line is reconstructed from how the rest of the
    # function uses these names.  The record layout (two little-endian
    # 64-bit values) follows from the `stp x2, x3, [x0], #16` in timelog;
    # the warm-up offset of 256 samples and the choice of log[off] / log[-1]
    # as reference samples are assumptions - confirm against the upstream
    # script.
    log = [struct.unpack_from("<QQ", logdata, i * 16)
           for i in range(LOG_ITERS)]

    off = 256

    ts_0, cyc_0 = log[off]
    ts_e, cyc_e = log[-1]
    f_init = f_end = None
    dts_init = None
    tidx = tts = None
    lts, lcyc = ts_0, cyc_0

    inc = to_pstate > from_pstate

    blip = 0
    cnt = dts_sum = 0
    for i in range(off, len(log)):
        ts, cyc = log[i]
        dts = ts - lts
        cnt += 1
        dts_sum += dts
        blip = max(blip, dts)
        if f_init is None and ts > tval:
            # First sample after the register write: everything before it
            # ran at the initial frequency.
            tidx = i
            f_init = (lcyc - cyc_0) / (lts - ts_0) * tfreq / 1000000
            dts_init = dts_sum / cnt
        if f_end is None and ts > (tval + ts_e) / 2:
            # Halfway between the write and the end of the log the switch
            # is taken to be complete; the remainder gives the final
            # frequency and the final average sample spacing.
            f_end = (cyc_e - cyc) / (ts_e - ts) * tfreq / 1000000
            cnt = dts_sum = 0
        lts, lcyc = ts, cyc

    if f_init is None or f_end is None:
        raise RuntimeError(
            f"p-state write at {tval} is not bracketed by the time log "
            f"({ts_0}..{ts_e})")

    dts_end = dts_sum / cnt

    window = 32

    if verbose:
        print(f"Triggered at {tval}")

    # Harmonic mean of the two frequencies is the detection threshold.
    thresh = 2 / (1 / f_init + 1 / f_end)

    # Slide a +/-window average over the log, starting at the trigger, until
    # the averaged frequency crosses the threshold in the expected direction.
    # The start is clamped so log[i - window] never wraps to the log's tail.
    for i in range(max(tidx, window), LOG_ITERS - window - 1):
        ts0, cyc0 = log[i - window]
        ts1, cyc1 = log[i + window]
        f = (cyc1 - cyc0) / (ts1 - ts0) * tfreq / 1000000
        crossed = (f > thresh) if inc else (f < thresh)
        if crossed:
            tts = log[i][0]
            tidx = i
            if verbose:
                print(f"Frequency transition at #{i} {tts}")
            break

    if tts is None:
        raise RuntimeError(
            f"no frequency transition found for p-state "
            f"{from_pstate} -> {to_pstate} "
            f"({f_init:.2f} -> {f_end:.2f} MHz)")

    if verbose:
        print(f"Initial frequency: {f_init:.2f}")
        print(f"Final frequency: {f_end:.2f}")
        print(f"Threshold: {thresh:.2f}")
        # Dump the samples around the transition; the upper bound is
        # clamped so log[i + window] stays inside the log.
        first = max(window, tidx - 10 * window)
        last = min(tidx + 10 * window, len(log) - window)
        for i in range(first, last):
            ts0, cyc0 = log[i - window]
            ts1, cyc1 = log[i + window]
            lts, lcyc = log[i - 1]
            ts, cyc = log[i]
            f = (cyc1 - cyc0) / (ts1 - ts0) * tfreq / 1000000
            print(f"{i}: {ts}: {cyc} ({ts-lts}: {cyc-lcyc}): {f:.2f}")

    # Report the spike relative to the normal sample spacing.
    blip -= min(dts_init, dts_end)

    return (tts - tval) / tfreq * 1000000000, blip / tfreq * 1000000000


for cluster, creg in enumerate(CREG):
    cpu = TEST_CPUS[cluster]

    # Measure the clock of every p-state (including 0) on this cluster.
    freqs = []

    print(f"#### Cluster {cluster} ####")
    print(" P-States:")
    print(" ", end="")
    for pstate in range(MAX_PSTATE[cluster] + 1):
        set_pstate(cluster, pstate)
        freq = int(round(bench_cpu(cpu)))
        freqs.append(freq)
        print(f"{pstate}:{freq}MHz", end=" ")
    print()
    print()

    # Latency matrix header: one column per target p-state (1..max).
    print(" To-> |", end="")
    for to_pstate in range(1, MAX_PSTATE[cluster] + 1):
        print(f" {freqs[to_pstate]:7d} |", end="")
    print()
    print(" From |", end="")
    for to_pstate in range(1, MAX_PSTATE[cluster] + 1):
        print("---------+", end="")
    print()

    maxblip = 0

    # One row per source p-state; the diagonal is left blank.
    for from_pstate in range(1, MAX_PSTATE[cluster] + 1):
        print(f" {freqs[from_pstate]:4d} |", end="")
        for to_pstate in range(1, MAX_PSTATE[cluster] + 1):
            if from_pstate == to_pstate:
                print(" ******* |", end="")
                continue
            lat, blip = bench_latency(cluster, cpu, from_pstate, to_pstate)
            print(f" {lat:7.0f} |", end="")
            maxblip = max(maxblip, blip)
        print()

    print()
    print(f"Maximum execution latency spike: {maxblip:.0f} ns")
    print()
    print()

# Uncomment to trace a single transition with verbose output:
#bench_latency(1, TEST_CPUS[1], 15, 14, True)