m1n1/ane: Initial commit

Signed-off-by: Eileen Yoon <eyn@gmx.com>
This commit is contained in:
Eileen Yoon 2023-03-24 23:03:01 +09:00 committed by Hector Martin
parent a633b90634
commit 6ce14d8735
3 changed files with 437 additions and 0 deletions

43
proxyclient/experiments/ane.py Executable file
View file

@ -0,0 +1,43 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: MIT
import sys, pathlib
sys.path.append(str(pathlib.Path(__file__).resolve().parents[1]))
from m1n1.setup import *
from m1n1.shell import run_shell
from m1n1.fw.ane import ANE
import numpy as np
from anect import anect_convert # pip install anect
def f16encode(x):
    """Return the raw bytes of ``x`` converted to a NumPy half-precision float."""
    half = np.float16(x)
    return half.tobytes()
def f16decode(b):
    """Interpret the first two bytes of ``b`` as a half-precision float and return it."""
    halves = np.frombuffer(b[:2], dtype=np.float16)
    return halves[0]
# Bring the ANE up before touching any of its registers.
ane = ANE(u)
ane.power_up()

if True:
    # Register ranges to watch, given as (start, end, name).
    rnges = [(0x26bc04000, 0x26bc28000, 'engine')]
    mon = RegMonitor(u)
    for start, end, name in rnges:
        mon.add(start, end - start, name=name)
    mon.poll()  # should work after ane.power_up()

# The compiled network has to be fetched by hand first:
# curl -LJO https://www.dropbox.com/s/lpjap6w0kdlom1h/add.hwx?dl=0
anec = anect_convert("add.hwx")
req = ane.fw.setup(anec)

# Two half-precision scalars, one per source slot of the request.
x1, x2 = f16encode(1.0), f16encode(2.0)
ane.fw.send_src(req, x1, 0)
ane.fw.send_src(req, x2, 1)

# Queue the request, run it, then read back destination 0.
ane.tm.enqueue_tq(req)
ane.tm.execute_tq(req)

x3 = ane.fw.read_dst(req, 0)
print("what's 1+2? = %f" % f16decode(x3[:2]))

# Leave every global above available in the interactive shell.
run_shell(globals(), msg="Have fun!")

197
proxyclient/m1n1/fw/ane.py Normal file
View file

@ -0,0 +1,197 @@
# SPDX-License-Identifier: MIT
import struct
from m1n1.hw.dart import DART
from m1n1.hw.ane import ANERegs, ANEDARTRegs, ANETaskManager
class ANE:
    """Handle for the ANE block, driven through the m1n1 proxy.

    Construction enables the ``/arm-io/ane`` and ``/arm-io/dart-ane`` clocks,
    writes the static tunables, initializes the DART and creates the helper
    objects ``allocator``, ``fw`` and ``tm``.

    Args:
        u: m1n1 utilities object; must provide ``proxy``, ``adt`` and
            ``memalign``.
    """

    PAGE_SIZE = 0x4000
    TILE_SIZE = 0x4000

    def __init__(self, u):
        self.u = u
        self.p = u.proxy
        self.name = "ane"

        self.p.pmgr_adt_clocks_enable(f'/arm-io/{self.name}')
        self.p.pmgr_adt_clocks_enable(f'/arm-io/dart-{self.name}')

        self.base_addr = u.adt[f'arm-io/{self.name}'].get_reg(0)[0]
        self.regs = ANERegs(self.u, self.base_addr)
        self.apply_static_tunables()

        # Base address of the registers written by power_up()/power_down(),
        # keyed by ADT node name.
        ps_map = {
            "ane": 0x023b70c000,
            "ane0": 0x028e08c000,
            "ane1": 0x028e684000,
            "ane2": 0x228e08c000,
            "ane3": 0x228e684000,
        }
        self.ps_base_addr = ps_map[self.name]

        # we need a slight patch to dart
        self.dart = DART.from_adt(u, path=f'/arm-io/dart-{self.name}',
                                  instance=0, iova_range=(0x4000, 0xe0000000))
        self.dart.initialize()

        # One register view per "reg" entry 0..2 of the DART ADT node.
        dart_node = self.u.adt[f'/arm-io/dart-{self.name}']
        self.dart_regs = [ANEDARTRegs(self.u, dart_node.get_reg(prop)[0])
                          for prop in range(3)]

        # hack to initialize base ttbr: map one page at IOVA 0 so the DART
        # driver populates TTBR[0, 0], then copy that value to the other two
        # register sets.
        phys = self.u.memalign(self.PAGE_SIZE, self.PAGE_SIZE)
        self.dart.iomap_at(0, 0x0, phys, self.PAGE_SIZE)
        self.ttbr0_addr = self.dart.regs.TTBR[0, 0].val
        self.dart_regs[1].TTBR[0, 0].val = self.ttbr0_addr  # DMA fails w/o
        self.dart_regs[2].TTBR[0, 0].val = self.ttbr0_addr  # DMA fails w/o

        self.allocator = ANEAllocator(self)
        self.fw = ANEFirmware(self)
        self.tm = ANETaskManager(self)

    def apply_static_tunables(self):  # this cost me a solid week
        """Write the fixed (offset, value) tunables relative to ``base_addr``.

        The writes are issued in list order.
        """
        static_tunables_map = [
            (0x0, 0x10), (0x38, 0x50020), (0x3c, 0xa0030),
            (0x400, 0x40010001), (0x600, 0x1ffffff),
            (0x738, 0x200020), (0x798, 0x100030),
            (0x7f8, 0x100000a), (0x900, 0x101), (0x410, 0x1100),
            (0x420, 0x1100), (0x430, 0x1100)]
        for (offset, value) in static_tunables_map:
            self.p.write32(self.base_addr + offset, value)

    def power_up(self):
        """Re-enable the clocks, cycle the power registers and reset the task manager.

        Runs power_down() first, then writes 0xf to offsets 0x00..0x30
        (step 8) from ``ps_base_addr`` in ascending order.
        """
        self.p.pmgr_adt_clocks_enable(f'/arm-io/{self.name}')
        self.p.pmgr_adt_clocks_enable(f'/arm-io/dart-{self.name}')
        self.power_down()
        for offset in range(0x0, 0x30+0x8, 0x8):
            self.p.write32(self.ps_base_addr + offset, 0xf)
        self.tm.reset()

    def power_down(self):
        """Write 0x300 to offsets 0x30..0x00 (step 8) from ``ps_base_addr``, descending."""
        for offset in reversed(range(0x0, 0x30+0x8, 0x8)):
            self.p.write32(self.ps_base_addr + offset, 0x300)

    def ioread(self, iova, size):
        """Read ``size`` bytes at ``iova`` (truncated to 32 bits) through the DART."""
        # The leading 0 is passed straight to DART.ioread; presumably the
        # stream id - TODO confirm against the DART driver.
        return self.dart.ioread(0, iova & 0xffffffff, size)

    def iowrite(self, iova, buf):
        """Write ``buf`` at ``iova`` (truncated to 32 bits) through the DART."""
        self.dart.iowrite(0, iova & 0xffffffff, buf)

    def round_up(self, x, y):
        """Return the smallest multiple of ``y`` that is greater than or equal to ``x``.

        Uses ceiling division rather than the mask trick ``(x + y - 1) & -y``,
        so the result is also correct when ``y`` is not a power of two.

        Raises:
            ValueError: if ``y`` is not positive.
        """
        if y <= 0:
            raise ValueError('alignment must be positive, got %r' % (y,))
        return -(-x // y) * y
class ANEBuffer:
    """Plain record describing one mapped buffer.

    Attributes are stored exactly as given: ``mapid`` (allocator map id),
    ``phys`` (physical address), ``iova`` (device address) and ``size``.
    """

    def __init__(self, mapid, phys, iova, size):
        self.mapid, self.phys = mapid, phys
        self.iova, self.size = iova, size
class ANEAllocator:
    """Allocates page-aligned buffers, maps them through the DART and tracks them by map id."""

    def __init__(self, ane):
        self.ane = ane
        self.mapid = 0   # id handed to the next allocation
        self.map = {}    # mapid -> ANEBuffer

    def alloc_size(self, size):
        """Allocate, zero and map a buffer of at least ``size`` bytes; return its IOVA.

        ``size`` is rounded up to a multiple of the ANE page size before use.
        """
        ane = self.ane
        aligned = ane.round_up(size, ane.PAGE_SIZE)
        phys = ane.u.memalign(ane.PAGE_SIZE, aligned)
        ane.p.memset32(phys, 0, aligned)
        iova = ane.dart.iomap(0, phys, aligned)

        mapid = self.mapid
        entry = ANEBuffer(mapid, phys, iova, aligned)
        self.map[mapid] = entry
        print("mapid %d: mapped phys 0x%x to iova 0x%06x for data w/ size 0x%06x"
              % (entry.mapid, entry.phys, entry.iova, entry.size))
        self.mapid = mapid + 1
        return entry.iova

    def alloc_data(self, data):
        """Allocate a buffer big enough for ``data``, copy ``data`` into it, return its IOVA."""
        target = self.alloc_size(len(data))
        self.ane.iowrite(target, data)
        return target

    def dump_map(self):
        """Print one line per tracked buffer, in insertion order."""
        for entry in self.map.values():
            print('mapid %d: phys 0x%x, iova 0x%x, size 0x%x'
                  % (entry.mapid, entry.phys, entry.iova, entry.size))
class ANEEngineReq:
    """Mutable state of one engine request built around a converted network ``anec``.

    Every field other than ``anec`` starts at zero; ``bar`` is a list of
    0x20 zero entries.
    """

    def __init__(self, anec):
        self.anec = anec
        # Task-descriptor size and count.
        self.td_size = self.td_count = 0
        # FIFO location, plus the nid and queue id used when queueing.
        self.fifo_iova = self.nid = self.qid = 0
        # One IOVA slot per bar index.
        self.bar = [0x0 for _ in range(0x20)]
class ANEFirmware:
    """Builds engine requests from a converted network and moves tensor data in and out.

    Args:
        ane: owning ANE object; its ``allocator``, ``round_up``, ``iowrite``,
            ``ioread`` and ``TILE_SIZE`` are used.
    """

    FIFO_NID = 0x40
    FIFO_COUNT = 0x20
    FIFO_WIDTH = 0x400  # nextpow2(0x274)

    def __init__(self, ane):
        self.ane = ane

    def setup(self, anec):
        """Allocate every buffer ``anec`` needs and return the populated request.

        bar[0] receives the task blob ``anec.data[tsk_start:tsk_start+tsk_size]``;
        bar[1] receives the kernel blob that starts at ``tsk_start`` plus
        ``tsk_size`` rounded up to 0x10. Each bar index from 3 to 0x1f with a
        non-zero tile count gets a fresh buffer of that many tiles.
        """
        req = ANEEngineReq(anec)
        req.td_size = anec.td_size
        req.td_count = anec.td_count

        # setup immutable bar
        tsk_buf = anec.data[anec.tsk_start:anec.tsk_start+anec.tsk_size]
        req.bar[0] = self.ane.allocator.alloc_data(tsk_buf)

        krn_start = anec.tsk_start + self.ane.round_up(anec.tsk_size, 0x10)
        krn_buf = anec.data[krn_start:krn_start+anec.krn_size]
        req.bar[1] = self.ane.allocator.alloc_data(krn_buf)

        # setup mutable bar (indices below 3 are never allocated here)
        for bdx in range(3, 0x20):
            tile_count = anec.tiles[bdx]
            if tile_count:
                size = tile_count * self.ane.TILE_SIZE
                req.bar[bdx] = self.ane.allocator.alloc_size(size)

        self.make_fifo(req)
        return req

    def make_fifo(self, req):
        """Allocate the request FIFO and write two copies of the first task descriptor.

        The copy at ``fifo_iova`` carries nid ``FIFO_NID``; the copy at
        ``fifo_iova + FIFO_WIDTH`` carries ``FIFO_NID + FIFO_COUNT``. Also
        sets ``req.fifo_iova``, ``req.nid`` and ``req.qid``.
        """
        anec = req.anec
        pool_size = self.ane.round_up(self.FIFO_WIDTH * 2, self.ane.TILE_SIZE)
        fifo_iova = self.ane.allocator.alloc_size(pool_size)

        td_buf = anec.data[anec.tsk_start:anec.tsk_start+anec.td_size]
        fifo_head = self.set_nid(td_buf, self.FIFO_NID)
        fifo_tail = self.set_nid(td_buf, self.FIFO_NID + self.FIFO_COUNT)
        self.ane.iowrite(fifo_iova, fifo_head)
        self.ane.iowrite(fifo_iova + self.FIFO_WIDTH, fifo_tail)

        req.fifo_iova = fifo_iova
        req.nid = self.FIFO_NID
        req.qid = 4  # just the default queue

    def set_nid(self, td_buf, nid):
        """Return a copy of ``td_buf`` with bits 16-23 of its first LE u32 set to ``nid``."""
        hdr0 = struct.unpack('<L', td_buf[:4])[0]
        # NOTE(review): the mask 0xf00ffff also clears bits 28-31 of the
        # header, not only the nid field - confirm this is intended.
        hdr0 = (hdr0 & 0xf00ffff) | ((nid & 0xff) << 16)
        return struct.pack('<L', hdr0) + td_buf[4:]

    def send_src(self, req, src_buf, idx):
        """Write source tensor ``idx`` into its buffer, sized to ``src_sizes[idx]``.

        Source buffers live at ``req.bar[4 + dst_count + idx]``. Input shorter
        than the expected size is zero-padded and longer input is truncated,
        so exactly ``src_sizes[idx]`` bytes are always written.
        """
        iova = req.bar[4 + req.anec.dst_count + idx]
        size = req.anec.src_sizes[idx]
        # The previous b''*(n) padding was always empty; pad with real zero bytes.
        payload = bytes(src_buf[:size]).ljust(size, b'\x00')
        self.ane.iowrite(iova, payload)

    def read_dst(self, req, idx):
        """Read back destination tensor ``idx`` (``dst_sizes[idx]`` bytes at ``req.bar[4 + idx]``)."""
        iova = req.bar[4 + idx]
        size = req.anec.dst_sizes[idx]
        return self.ane.ioread(iova, size)

197
proxyclient/m1n1/hw/ane.py Normal file
View file

@ -0,0 +1,197 @@
# SPDX-License-Identifier: MIT
from ..utils import *
from .dart import DARTRegs
import time
class ANERegs(RegMap):
    """32-bit registers of the ANE block; offsets are relative to the map's base address."""

    PMGR1           = 0x0000738, Register32
    PMGR2           = 0x0000798, Register32
    PMGR3           = 0x00007f8, Register32

    ASC_IO_RVBAR    = 0x1050000, Register32
    ASC_EDPRCR      = 0x1010310, Register32

    # 24hz clocks, counter at +4
    CLK0            = 0x1160008, Register32
    CTR0            = 0x116000c, Register32
    CLK1            = 0x1168008, Register32
    CTR1            = 0x116800c, Register32
    CLK2            = 0x1170000, Register32
    CTR2            = 0x1170004, Register32
    CLK3            = 0x1178000, Register32
    CTR3            = 0x1178004, Register32

    VERS            = 0x1840000, Register32

    # for acks w/ rtkit
    GPIO0           = 0x1840048, Register32
    GPIO1           = 0x184004c, Register32
    GPIO2           = 0x1840050, Register32
    GPIO3           = 0x1840054, Register32
    GPIO4           = 0x1840058, Register32
    GPIO5           = 0x184005c, Register32
    GPIO6           = 0x1840060, Register32
    GPIO7           = 0x1840064, Register32
class ANEDARTRegs(DARTRegs):
    """DARTRegs extended with two additional 32-bit registers of unknown purpose."""

    UNK_CONFIG_68   = 0x68, Register32
    UNK_CONFIG_6c   = 0x6c, Register32
class R_TQINFO(Register32):
    """Bitfield layout of a task queue's TQINFO register."""

    UNK     = (31, 16)  # upper half-word, meaning unknown
    NID     = (15, 0)   # lower half-word
class TaskQueue(RegMap):
    """Per-queue register arrays: every entry is irange(offset, 8, 0x148).

    BAR1 and BAR2 take a second index, declared as irange(0x0, 0x20, 4).
    """

    STATUS      = irange(0x000, 8, 0x148), Register32
    PRTY        = irange(0x010, 8, 0x148), Register32
    FREE_SPACE  = irange(0x014, 8, 0x148), Register32
    TQINFO      = irange(0x01c, 8, 0x148), R_TQINFO

    # First register bank.
    BAR1        = (irange(0x020, 8, 0x148), irange(0x0, 0x20, 4)), Register32
    REQ_NID1    = irange(0x0a0, 8, 0x148), Register32

    # Second register bank. Note the numeric suffixes: SIZE2/ADDR2 sit before
    # BAR2 while SIZE1/ADDR1 sit after REQ_NID2.
    REQ_SIZE2   = irange(0x0a4, 8, 0x148), Register32
    REQ_ADDR2   = irange(0x0a8, 8, 0x148), Register32
    BAR2        = (irange(0x0ac, 8, 0x148), irange(0x0, 0x20, 4)), Register32
    REQ_NID2    = irange(0x12c, 8, 0x148), Register32
    REQ_SIZE1   = irange(0x130, 8, 0x148), Register32
    REQ_ADDR1   = irange(0x134, 8, 0x148), Register32
class R_REQINFO(Register32):
    """Bitfield layout of the task manager's REQ_INFO register."""

    TDSIZE  = (31, 16)  # upper half-word
    TDCOUNT = (15, 0)   # lower half-word
class R_IRQINFO(Register32):
    """Bitfield layout of the IRQ_EVT*_DAT_INFO registers, one field per byte."""

    CNT     = (31, 24)
    NID     = (23, 16)
    UNK1    = (15, 8)
    UNK2    = (7, 0)
class TMRegs(RegMap):
    """Task manager registers; offsets are relative to the map's base address."""

    # Request submission.
    REQ_ADDR            = 0x00, Register32
    REQ_INFO            = 0x04, R_REQINFO
    REQ_PUSH            = 0x08, Register32
    TQ_EN               = 0x0c, Register32

    # Event line 1: count register followed by its data registers.
    IRQ_EVT1_CNT        = 0x14, Register32
    IRQ_EVT1_DAT_INFO   = 0x18, R_IRQINFO
    IRQ_EVT1_DAT_UNK1   = 0x1c, Register32
    IRQ_EVT1_DAT_TIME   = 0x20, Register32
    IRQ_EVT1_DAT_UNK2   = 0x24, Register32

    # Event line 2: same layout as line 1.
    IRQ_EVT2_CNT        = 0x28, Register32
    IRQ_EVT2_DAT_INFO   = 0x2c, R_IRQINFO
    IRQ_EVT2_DAT_UNK1   = 0x30, Register32
    IRQ_EVT2_DAT_TIME   = 0x34, Register32
    IRQ_EVT2_DAT_UNK2   = 0x38, Register32

    COMMIT_INFO         = 0x44, Register32
    TM_STATUS           = 0x54, Register32

    UNK_IRQ_EN1         = 0x68, Register32
    UNK_IRQ_ACK         = 0x6c, Register32
    UNK_IRQ_EN2         = 0x70, Register32
class ANETaskManager:
    """Programs the per-queue task queue (TQ) registers and the task manager (TM).

    Args:
        ane: owning ANE object; ``u``, ``p`` and ``base_addr`` are read from it.
    """

    TQ_COUNT = 8
    TQ_WIDTH = 0x148
    # Value written to PRTY for each queue, indexed by qid.
    tq_prty = (0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x1e, 0x1f)

    def __init__(self, ane):
        self.u = ane.u
        self.p = ane.p
        self.TM_BASE_ADDR = ane.base_addr + 0x1c00000 + 0x24000
        self.TQ_BASE_ADDR = ane.base_addr + 0x1c00000 + 0x25000
        self.regs = TMRegs(self.u, self.TM_BASE_ADDR)
        self.tq = TaskQueue(self.u, self.TQ_BASE_ADDR)

    def reset(self):  # these reset with ANE_SET pds
        """Write TQ_EN, the per-queue priorities and the two irq-enable registers."""
        self.regs.TQ_EN.val = 0x3000
        # set priority param for each queue
        for qid, prty in enumerate(self.tq_prty):
            self.tq.PRTY[qid].val = prty
        self.regs.UNK_IRQ_EN1.val = 0x4000000  # enable irq
        self.regs.UNK_IRQ_EN2.val = 0x6  # enable irq

    def enqueue_tq(self, req):
        """Load ``req`` (bars, size, fifo address, nid) into task queue ``req.qid``.

        Raises:
            ValueError: if ``req.qid`` is outside 1..7, or if the queue's PRTY
                register does not hold the value written by reset().
        """
        qid = req.qid
        if not (1 <= qid < self.TQ_COUNT):
            raise ValueError('1 <= qid <= 7')
        if self.tq.PRTY[qid].val != self.tq_prty[qid]:
            raise ValueError('invalid priority setup for tq %d' % qid)

        print('enqueueing task w/ fifo 0x%x to tq %d' % (req.fifo_iova, qid))
        self.tq.STATUS[qid].val = 0x1  # in use

        # Only non-zero bar entries are written.
        for bdx, iova in enumerate(req.bar):
            if iova:
                print("bar %d: 0x%x" % (bdx, iova))
                self.tq.BAR1[qid, bdx].val = iova

        self.tq.REQ_SIZE1[qid].val = ((req.td_size << 0xe) + 0x1ff0000) & 0x1ff0000
        self.tq.REQ_ADDR1[qid].val = req.fifo_iova & 0xffffffff
        self.tq.REQ_NID1[qid].val = ((req.nid & 0xff) << 8) | 1

    def execute_tq(self, req):
        """Push the queued request to the task manager, report status and mark the queue done.

        The result of get_tm_status() is printed but not acted upon; the
        queue is marked done either way.
        """
        qid = req.qid
        print('arbitered tq %d; pushing to execution queue...' % qid)

        # transfer to main queue (now in in TM range)
        self.regs.REQ_ADDR.val = self.tq.REQ_ADDR1[qid].val
        # doesnt go through if 0
        self.regs.REQ_INFO.val = self.tq.REQ_SIZE1[qid].val | req.td_count
        # let's do magic
        self.regs.REQ_PUSH.val = self.tq_prty[qid] | ((qid & 7) << 8)

        self.get_tm_status()
        self.get_committed_info()
        self.irq_handler()
        self.tq.STATUS[qid].val = 0x0  # done

    def get_tm_status(self, max_timeouts=100, interval=0.01):
        """Poll TM_STATUS until bit 0 is set; return True on success, False on timeout.

        Args:
            max_timeouts: maximum number of polls. Values below 1 are treated
                as 1 so the register is always read at least once.
            interval: seconds to sleep after each unsuccessful poll.
        """
        status = 0
        for _ in range(max(1, max_timeouts)):
            status = self.regs.TM_STATUS.val
            success = (status & 1) != 0
            print('tm status: 0x%x, success: %r' % (status, success))
            if success:
                return True
            time.sleep(interval)
        print('timeout, tm is non-idle! status: 0x%x' % status)
        return False

    def get_committed_info(self):
        """Print the nid held in bits 16-23 of COMMIT_INFO."""
        committed_nid = (self.regs.COMMIT_INFO.val >> 0x10) & 0xff
        print('pushed td w/ nid 0x%x to execution' % committed_nid)

    def irq_handler(self):
        """Drain and print the pending events of both event lines.

        Line 0 (IRQ_EVT1_*) sets bit 1 of UNK_IRQ_ACK after each event;
        line 1 (IRQ_EVT2_*) does not.
        """
        self._drain_irq_line(0, ack=True)
        self._drain_irq_line(1, ack=False)

    def _drain_irq_line(self, line, ack):
        """Read every pending event of IRQ_EVT<line+1>_*, optionally acking each one."""
        prefix = 'IRQ_EVT%d_' % (line + 1)
        evtcnt = getattr(self.regs, prefix + 'CNT').val
        print('irq handler: LINE %d EVTCNT: %d' % (line, evtcnt))
        for evt_n in range(evtcnt):  # needs to be cleared
            # All four data registers are read in this order even though
            # UNK1/UNK2 are unused; presumably the reads are what clears the
            # event - TODO confirm.
            info = getattr(self.regs, prefix + 'DAT_INFO').val
            _unk1 = getattr(self.regs, prefix + 'DAT_UNK1').val
            tmstmp = getattr(self.regs, prefix + 'DAT_TIME').val
            _unk2 = getattr(self.regs, prefix + 'DAT_UNK2').val
            print('irq handler: LINE %d EVT %d: executed info 0x%x @ 0x%x'
                  % (line, evt_n, info, tmstmp))
            if ack:
                self.regs.UNK_IRQ_ACK.val = self.regs.UNK_IRQ_ACK.val | 2