mirror of
https://github.com/AsahiLinux/m1n1
synced 2024-11-25 16:10:16 +00:00
m1n1/ane: Initial commit
Signed-off-by: Eileen Yoon <eyn@gmx.com>
This commit is contained in:
parent
a633b90634
commit
6ce14d8735
3 changed files with 437 additions and 0 deletions
43
proxyclient/experiments/ane.py
Executable file
43
proxyclient/experiments/ane.py
Executable file
|
@ -0,0 +1,43 @@
|
|||
#!/usr/bin/env python3
|
||||
# SPDX-License-Identifier: MIT
|
||||
import sys, pathlib
|
||||
sys.path.append(str(pathlib.Path(__file__).resolve().parents[1]))
|
||||
|
||||
from m1n1.setup import *
|
||||
from m1n1.shell import run_shell
|
||||
|
||||
from m1n1.fw.ane import ANE
|
||||
|
||||
import numpy as np
|
||||
from anect import anect_convert # pip install anect
|
||||
def f16encode(x):
    """Return *x* encoded as IEEE 754 half-precision (float16) bytes.

    The byte order is pinned to little-endian so the result does not depend
    on the byte order of the host running this script (on a little-endian
    host this is identical to ``np.float16(x).tobytes()``).
    """
    return np.asarray(x, dtype='<f2').tobytes()
|
||||
def f16decode(b):
    """Decode the first two bytes of *b* as a little-endian float16 scalar.

    Any bytes after the first two are ignored.  The byte order is explicit
    so the result does not depend on the host's native byte order.
    """
    return np.frombuffer(b[:2], dtype='<f2')[0]
|
||||
|
||||
|
||||
# Create the ANE handle and power the block up before touching its registers.
ane = ANE(u)
ane.power_up()

if 1:
    # Register a monitor over the 'engine' register window and poll it once.
    rnges = [(0x26bc04000, 0x26bc28000, 'engine'),]
    mon = RegMonitor(u)
    for start, end, name in rnges:
        mon.add(start, end - start, name=name)
    mon.poll()  # should work after ane.power_up()

# The compiled network must be fetched manually first:
# curl -LJO https://www.dropbox.com/s/lpjap6w0kdlom1h/add.hwx?dl=0
anec = anect_convert("add.hwx")
req = ane.fw.setup(anec)

# Two float16 operands, written to source slots 0 and 1 of the request.
x1 = f16encode(1.0)
x2 = f16encode(2.0)
ane.fw.send_src(req, x1, 0)
ane.fw.send_src(req, x2, 1)

# Queue the request, then push it to execution.
ane.tm.enqueue_tq(req)
ane.tm.execute_tq(req)

# Read destination slot 0 back and decode its first float16 value.
x3 = ane.fw.read_dst(req, 0)
print("what's 1+2? = %f" % f16decode(x3[:2]))


# Drop into the interactive shell with everything above in scope.
run_shell(globals(), msg="Have fun!")
|
||||
|
197
proxyclient/m1n1/fw/ane.py
Normal file
197
proxyclient/m1n1/fw/ane.py
Normal file
|
@ -0,0 +1,197 @@
|
|||
# SPDX-License-Identifier: MIT
|
||||
|
||||
import struct
|
||||
|
||||
from m1n1.hw.dart import DART
|
||||
from m1n1.hw.ane import ANERegs, ANEDARTRegs, ANETaskManager
|
||||
|
||||
|
||||
class ANE:
    """Proxy-side handle for the ANE block.

    Construction enables the clocks for the ``ane`` and ``dart-ane`` ADT
    nodes, applies a fixed set of register tunables, initializes the DART and
    creates the allocator, firmware helper and task manager that operate on
    this instance.

    Attributes set in ``__init__``: ``u``/``p`` (proxy utils and proxy),
    ``name``, ``base_addr``, ``regs``, ``ps_base_addr``, ``dart``,
    ``dart_regs``, ``ttbr0_addr``, ``allocator``, ``fw`` and ``tm``.
    """

    PAGE_SIZE = 0x4000
    TILE_SIZE = 0x4000

    # Offsets (relative to ps_base_addr) written by power_up()/power_down():
    # 0x0, 0x8, ..., 0x30.
    _PS_OFFSETS = range(0x0, 0x30 + 0x8, 0x8)

    def __init__(self, u):
        """Bring up clocks, tunables and the DART for the ANE.

        Args:
            u: m1n1 proxy utils object; ``u.proxy``, ``u.adt`` and
                ``u.memalign`` are used.
        """
        self.u = u
        self.p = u.proxy

        self.name = "ane"
        self.p.pmgr_adt_clocks_enable(f'/arm-io/{self.name}')
        self.p.pmgr_adt_clocks_enable(f'/arm-io/dart-{self.name}')

        # Same leading-slash path form as every other ADT lookup in this class.
        self.base_addr = u.adt[f'/arm-io/{self.name}'].get_reg(0)[0]
        self.regs = ANERegs(self.u, self.base_addr)
        self.apply_static_tunables()

        # Hard-coded base address per node name; only "ane" is used here
        # because self.name is fixed above.
        ps_map = {
            "ane": 0x023b70c000,
            "ane0": 0x028e08c000,
            "ane1": 0x028e684000,
            "ane2": 0x228e08c000,
            "ane3": 0x228e684000,
        }
        self.ps_base_addr = ps_map[self.name]

        # we need a slight patch to dart
        self.dart = DART.from_adt(u, path=f'/arm-io/dart-{self.name}',
                                  instance=0, iova_range=(0x4000, 0xe0000000))
        self.dart.initialize()

        # One register view for each of the first three 'reg' entries of the
        # dart node.
        dart_regs = []
        for prop in range(3):
            dart_addr = self.u.adt[f'/arm-io/dart-{self.name}'].get_reg(prop)[0]
            dart_regs.append(ANEDARTRegs(self.u, dart_addr))
        self.dart_regs = dart_regs

        # hack to initialize base ttbr: map one page at iova 0 so that
        # TTBR[0, 0] holds a value, then copy that value to the other two
        # register views.
        phys = self.u.memalign(self.PAGE_SIZE, self.PAGE_SIZE)
        self.dart.iomap_at(0, 0x0, phys, self.PAGE_SIZE)
        self.ttbr0_addr = self.dart.regs.TTBR[0, 0].val
        self.dart_regs[1].TTBR[0, 0].val = self.ttbr0_addr  # DMA fails w/o
        self.dart_regs[2].TTBR[0, 0].val = self.ttbr0_addr  # DMA fails w/o

        self.allocator = ANEAllocator(self)
        self.fw = ANEFirmware(self)
        self.tm = ANETaskManager(self)

    def apply_static_tunables(self):  # this cost me a solid week
        """Write the fixed (offset, value) tunables relative to ``base_addr``.

        The writes are issued as 32-bit stores, in list order.
        """
        static_tunables_map = [
            (0x0, 0x10), (0x38, 0x50020), (0x3c, 0xa0030),
            (0x400, 0x40010001), (0x600, 0x1ffffff),
            (0x738, 0x200020), (0x798, 0x100030),
            (0x7f8, 0x100000a), (0x900, 0x101), (0x410, 0x1100),
            (0x420, 0x1100), (0x430, 0x1100)]
        for (offset, value) in static_tunables_map:
            self.p.write32(self.base_addr + offset, value)

    def power_up(self):
        """Enable clocks, cycle the power-state registers and reset the TM.

        ``power_down()`` is run first, then 0xf is written to each
        power-state offset in ascending order, then the task manager is reset.
        """
        self.p.pmgr_adt_clocks_enable(f'/arm-io/{self.name}')
        self.p.pmgr_adt_clocks_enable(f'/arm-io/dart-{self.name}')
        self.power_down()
        for offset in self._PS_OFFSETS:
            self.p.write32(self.ps_base_addr + offset, 0xf)
        self.tm.reset()

    def power_down(self):
        """Write 0x300 to each power-state offset, highest offset first."""
        for offset in reversed(self._PS_OFFSETS):
            self.p.write32(self.ps_base_addr + offset, 0x300)

    def ioread(self, iova, size):
        """Read *size* bytes at *iova* (truncated to 32 bits) via DART stream 0."""
        return self.dart.ioread(0, iova & 0xffffffff, size)

    def iowrite(self, iova, buf):
        """Write *buf* at *iova* (truncated to 32 bits) via DART stream 0."""
        self.dart.iowrite(0, iova & 0xffffffff, buf)

    def round_up(self, x, y):
        """Return *x* rounded up to the nearest multiple of *y*.

        Uses ceiling division, so the result is correct for any positive
        integer *y*; for power-of-two *y* it matches the previous bit-mask
        form ``(x + (y - 1)) & -y``.
        """
        return -(-x // y) * y
|
||||
|
||||
|
||||
class ANEBuffer:
    """Record of one mapped buffer: its map id, physical address, IOVA and size."""

    def __init__(self, mapid, phys, iova, size):
        self.mapid = mapid  # key under which the allocator stores this buffer
        self.phys = phys    # physical address of the backing memory
        self.iova = iova    # device-side (DART) address of the mapping
        self.size = size    # size of the mapping in bytes

    def __repr__(self):
        return ('%s(mapid=%d, phys=0x%x, iova=0x%x, size=0x%x)'
                % (type(self).__name__, self.mapid, self.phys,
                   self.iova, self.size))
|
||||
|
||||
|
||||
class ANEAllocator:
    """Allocates page-aligned buffers and maps them through the ANE's DART.

    Every allocation is recorded in ``self.map`` under a monotonically
    increasing integer id (``self.mapid`` is the next id to hand out).
    """

    def __init__(self, ane):
        self.ane = ane
        self.mapid = 0
        self.map = {}

    def alloc_size(self, size):
        """Allocate, zero and map a buffer of at least *size* bytes.

        The size is rounded up to a multiple of the ANE page size.  The
        mapping is logged and recorded in ``self.map``.

        Returns:
            The IOVA of the new mapping.
        """
        ane = self.ane
        aligned = ane.round_up(size, ane.PAGE_SIZE)
        phys = ane.u.memalign(ane.PAGE_SIZE, aligned)
        ane.p.memset32(phys, 0, aligned)
        iova = ane.dart.iomap(0, phys, aligned)

        mapid = self.mapid
        entry = ANEBuffer(mapid, phys, iova, aligned)
        self.map[mapid] = entry
        print("mapid %d: mapped phys 0x%x to iova 0x%06x for data w/ size 0x%06x"
              % (entry.mapid, entry.phys, entry.iova, entry.size))
        self.mapid = mapid + 1
        return entry.iova

    def alloc_data(self, data):
        """Allocate a buffer large enough for *data*, copy it in, return the IOVA."""
        iova = self.alloc_size(len(data))
        self.ane.iowrite(iova, data)
        return iova

    def dump_map(self):
        """Print one line per recorded mapping, in insertion order."""
        for entry in self.map.values():
            print('mapid %d: phys 0x%x, iova 0x%x, size 0x%x'
                  % (entry.mapid, entry.phys, entry.iova, entry.size))
|
||||
|
||||
|
||||
class ANEEngineReq:
    """Mutable state for one request built from a converted network (*anec*).

    All numeric fields start at zero and are filled in by the code that sets
    the request up.
    """

    # Number of base-address (BAR) slots carried by every request.
    BAR_COUNT = 0x20

    def __init__(self, anec):
        self.anec = anec
        self.td_size = 0
        self.td_count = 0
        self.fifo_iova = 0
        self.nid = 0
        self.qid = 0
        # One IOVA per BAR slot; 0 means "slot unused".  A fresh list per
        # instance so requests never share slots.
        self.bar = [0x0] * self.BAR_COUNT
|
||||
|
||||
|
||||
class ANEFirmware:
    """Builds requests from a converted network and moves data in and out.

    ``setup()`` allocates and fills the buffers a request needs,
    ``send_src()`` writes an input buffer and ``read_dst()`` reads an output
    buffer back.
    """

    FIFO_NID = 0x40
    FIFO_COUNT = 0x20
    FIFO_WIDTH = 0x400  # nextpow2(0x274)

    def __init__(self, ane):
        self.ane = ane

    def setup(self, anec):
        """Create an ``ANEEngineReq`` for *anec* and allocate all its buffers.

        BAR slot 0 receives the task section and slot 1 the kernel section of
        ``anec.data``; every slot >= 3 with a non-zero tile count gets a
        zeroed buffer of ``tiles * TILE_SIZE`` bytes.  Finally the FIFO is
        built via ``make_fifo()``.

        Returns:
            The populated request.
        """
        req = ANEEngineReq(anec)
        req.td_size = anec.td_size
        req.td_count = anec.td_count

        # setup immutable bar
        tsk_buf = anec.data[anec.tsk_start:anec.tsk_start + anec.tsk_size]
        req.bar[0] = self.ane.allocator.alloc_data(tsk_buf)
        # The kernel section starts after the task section, rounded up to 0x10.
        krn_start = anec.tsk_start + self.ane.round_up(anec.tsk_size, 0x10)
        krn_buf = anec.data[krn_start:krn_start + anec.krn_size]
        req.bar[1] = self.ane.allocator.alloc_data(krn_buf)

        # setup mutable bar (slots 0..2 are never allocated here)
        for bdx in range(3, len(req.bar)):
            if anec.tiles[bdx]:
                size = anec.tiles[bdx] * self.ane.TILE_SIZE
                req.bar[bdx] = self.ane.allocator.alloc_size(size)

        self.make_fifo(req)
        return req

    def make_fifo(self, req):
        """Allocate the request FIFO and write its head and tail descriptors.

        Both entries are copies of the first ``td_size`` bytes of the task
        section with the nid field replaced: ``FIFO_NID`` for the head at
        offset 0, and ``FIFO_NID + FIFO_COUNT`` for the tail at offset
        ``FIFO_WIDTH``.  Sets ``req.fifo_iova``, ``req.nid`` and ``req.qid``.
        """
        anec = req.anec
        pool_size = self.ane.round_up(self.FIFO_WIDTH * 2, self.ane.TILE_SIZE)
        fifo_iova = self.ane.allocator.alloc_size(pool_size)

        td_buf = anec.data[anec.tsk_start:anec.tsk_start + anec.td_size]
        fifo_head = self.set_nid(td_buf, self.FIFO_NID)
        fifo_tail = self.set_nid(td_buf, self.FIFO_NID + self.FIFO_COUNT)
        self.ane.iowrite(fifo_iova, fifo_head)
        self.ane.iowrite(fifo_iova + self.FIFO_WIDTH, fifo_tail)

        req.fifo_iova = fifo_iova
        req.nid = self.FIFO_NID
        req.qid = 4  # just the default queue

    def set_nid(self, td_buf, nid):
        """Return a copy of *td_buf* with *nid* stored in bits 16..23 of its
        first little-endian 32-bit word.

        NOTE(review): the mask 0xf00ffff also clears bits 28..31 of that
        word; a mask that only clears the nid field would be 0xff00ffff.
        Left unchanged because the intended header layout cannot be
        confirmed from this file.
        """
        hdr0 = struct.unpack('<L', td_buf[:4])[0]
        hdr0 = (hdr0 & 0xf00ffff) | ((nid & 0xff) << 16)
        return struct.pack('<L', hdr0) + td_buf[4:]

    def send_src(self, req, src_buf, idx):
        """Write *src_buf* into the buffer of source number *idx*.

        Source buffers live in the BAR slots after the four leading slots and
        the destination slots.  The data is truncated, or zero-padded, to
        exactly ``req.anec.src_sizes[idx]`` bytes, so stale bytes from an
        earlier, longer write never survive.
        """
        iova = req.bar[4 + req.anec.dst_count + idx]
        size = req.anec.src_sizes[idx]
        # The previous code padded with b'' * n, which is always empty, so
        # short inputs were written unpadded.
        payload = bytes(src_buf[:size]).ljust(size, b'\x00')
        self.ane.iowrite(iova, payload)

    def read_dst(self, req, idx):
        """Read back all ``req.anec.dst_sizes[idx]`` bytes of destination *idx*."""
        iova = req.bar[4 + idx]
        size = req.anec.dst_sizes[idx]
        return self.ane.ioread(iova, size)
|
197
proxyclient/m1n1/hw/ane.py
Normal file
197
proxyclient/m1n1/hw/ane.py
Normal file
|
@ -0,0 +1,197 @@
|
|||
# SPDX-License-Identifier: MIT
|
||||
from ..utils import *
|
||||
from .dart import DARTRegs
|
||||
import time
|
||||
|
||||
|
||||
class ANERegs(RegMap):
    """32-bit registers of the ANE block, as offsets from the map's base."""

    PMGR1 = 0x738, Register32
    PMGR2 = 0x798, Register32
    PMGR3 = 0x7f8, Register32

    ASC_IO_RVBAR = 0x1050000, Register32
    ASC_EDPRCR   = 0x1010310, Register32

    # 24hz clocks, counter at +4
    # (each CTRn sits 4 bytes above the matching CLKn)
    CLK0 = 0x1160008, Register32
    CTR0 = 0x116000c, Register32
    CLK1 = 0x1168008, Register32
    CTR1 = 0x116800c, Register32
    CLK2 = 0x1170000, Register32
    CTR2 = 0x1170004, Register32
    CLK3 = 0x1178000, Register32
    CTR3 = 0x1178004, Register32

    VERS = 0x1840000, Register32

    # for acks w/ rtkit
    GPIO0 = 0x1840048, Register32
    GPIO1 = 0x184004c, Register32
    GPIO2 = 0x1840050, Register32
    GPIO3 = 0x1840054, Register32
    GPIO4 = 0x1840058, Register32
    GPIO5 = 0x184005c, Register32
    GPIO6 = 0x1840060, Register32
    GPIO7 = 0x1840064, Register32
|
||||
|
||||
|
||||
class ANEDARTRegs(DARTRegs):
    """``DARTRegs`` extended with two extra 32-bit registers of unknown purpose."""

    UNK_CONFIG_68 = 0x68, Register32
    UNK_CONFIG_6c = 0x6c, Register32
|
||||
|
||||
|
||||
class R_TQINFO(Register32):
    """Bit layout of a task queue's TQINFO register."""

    UNK = 31, 16  # upper half-word, meaning unknown
    NID = 15, 0   # lower half-word
|
||||
|
||||
class TaskQueue(RegMap):
    """Per-queue register arrays: 8 queues, each 0x148 bytes apart.

    BAR1 and BAR2 are two-dimensional: the first index selects the queue and
    the second selects one of the 0x20 slots, 4 bytes apart.
    """

    STATUS     = irange(0x00, 8, 0x148), Register32
    PRTY       = irange(0x10, 8, 0x148), Register32
    FREE_SPACE = irange(0x14, 8, 0x148), Register32
    TQINFO     = irange(0x1c, 8, 0x148), R_TQINFO

    BAR1      = (irange(0x20, 8, 0x148), irange(0x0, 0x20, 4)), Register32
    REQ_NID1  = irange(0xa0, 8, 0x148), Register32
    REQ_SIZE2 = irange(0xa4, 8, 0x148), Register32
    REQ_ADDR2 = irange(0xa8, 8, 0x148), Register32

    BAR2      = (irange(0xac, 8, 0x148), irange(0x0, 0x20, 4)), Register32
    REQ_NID2  = irange(0x12c, 8, 0x148), Register32
    REQ_SIZE1 = irange(0x130, 8, 0x148), Register32
    REQ_ADDR1 = irange(0x134, 8, 0x148), Register32
|
||||
|
||||
|
||||
class R_REQINFO(Register32):
    """Bit layout of the task manager's REQ_INFO register."""

    TDSIZE  = 31, 16  # upper half-word
    TDCOUNT = 15, 0   # lower half-word
|
||||
|
||||
class R_IRQINFO(Register32):
    """Bit layout of an IRQ event INFO register: four byte-wide fields."""

    CNT  = 31, 24
    NID  = 23, 16
    UNK1 = 15, 8
    UNK2 = 7, 0
|
||||
|
||||
class TMRegs(RegMap):
    """Task manager registers, as offsets from the map's base."""

    # Request submission.
    REQ_ADDR = 0x0, Register32
    REQ_INFO = 0x4, R_REQINFO
    REQ_PUSH = 0x8, Register32
    TQ_EN    = 0xc, Register32

    # Event line 1: count register followed by four data registers.
    IRQ_EVT1_CNT      = 0x14, Register32
    IRQ_EVT1_DAT_INFO = 0x18, R_IRQINFO
    IRQ_EVT1_DAT_UNK1 = 0x1c, Register32
    IRQ_EVT1_DAT_TIME = 0x20, Register32
    IRQ_EVT1_DAT_UNK2 = 0x24, Register32

    # Event line 2: same layout as line 1.
    IRQ_EVT2_CNT      = 0x28, Register32
    IRQ_EVT2_DAT_INFO = 0x2c, R_IRQINFO
    IRQ_EVT2_DAT_UNK1 = 0x30, Register32
    IRQ_EVT2_DAT_TIME = 0x34, Register32
    IRQ_EVT2_DAT_UNK2 = 0x38, Register32

    COMMIT_INFO = 0x44, Register32
    TM_STATUS   = 0x54, Register32

    UNK_IRQ_EN1 = 0x68, Register32
    UNK_IRQ_ACK = 0x6c, Register32
    UNK_IRQ_EN2 = 0x70, Register32
|
||||
|
||||
|
||||
class ANETaskManager:
    """Drives the task manager (TM) and task queue (TQ) register blocks.

    Typical flow: ``reset()`` once after power-up, then ``enqueue_tq(req)``
    followed by ``execute_tq(req)`` for each request.
    """

    TQ_COUNT = 8
    TQ_WIDTH = 0x148
    # Priority value programmed into each queue, indexed by queue id.
    tq_prty = (0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x1e, 0x1f)

    def __init__(self, ane):
        self.u = ane.u
        self.p = ane.p
        self.TM_BASE_ADDR = ane.base_addr + 0x1c00000 + 0x24000
        self.TQ_BASE_ADDR = ane.base_addr + 0x1c00000 + 0x25000
        self.regs = TMRegs(self.u, self.TM_BASE_ADDR)
        self.tq = TaskQueue(self.u, self.TQ_BASE_ADDR)

    def reset(self):  # these reset with ANE_SET pds
        """Re-program TQ_EN, the per-queue priorities and the IRQ enables."""
        self.regs.TQ_EN.val = 0x3000

        # set priority param for each queue
        for qid, prty in enumerate(self.tq_prty):
            self.tq.PRTY[qid].val = prty

        self.regs.UNK_IRQ_EN1.val = 0x4000000  # enable irq
        self.regs.UNK_IRQ_EN2.val = 0x6  # enable irq

    def enqueue_tq(self, req):
        """Load *req* (BARs, FIFO address, size and nid) into its task queue.

        Raises:
            ValueError: if ``req.qid`` is outside 1..7, or the queue's
                priority register does not hold the expected value (i.e.
                ``reset()`` has not been applied).
        """
        qid = req.qid
        if not 1 <= qid < self.TQ_COUNT:
            raise ValueError('1 <= qid <= 7')
        if self.tq.PRTY[qid].val != self.tq_prty[qid]:
            raise ValueError('invalid priority setup for tq %d' % qid)

        print('enqueueing task w/ fifo 0x%x to tq %d' % (req.fifo_iova, qid))
        self.tq.STATUS[qid].val = 0x1  # in use

        # Only non-zero BAR slots are written.
        for bdx, iova in enumerate(req.bar):
            if iova:
                print("bar %d: 0x%x" % (bdx, iova))
                self.tq.BAR1[qid, bdx].val = iova

        # td_size shifted up by 14 bits, biased by 0x1ff0000 and masked to
        # bits 16..24.
        self.tq.REQ_SIZE1[qid].val = ((req.td_size << 0xe) + 0x1ff0000) & 0x1ff0000
        self.tq.REQ_ADDR1[qid].val = req.fifo_iova & 0xffffffff
        self.tq.REQ_NID1[qid].val = ((req.nid & 0xff) << 8) | 1

    def execute_tq(self, req):
        """Push the queue holding *req* to execution and report the outcome.

        Copies the queue's address and size into the TM request registers,
        writes REQ_PUSH, then polls the TM status, prints the committed nid,
        drains the IRQ events and finally marks the queue as free.
        """
        qid = req.qid
        print('arbitered tq %d; pushing to execution queue...' % qid)

        # transfer to main queue (now in TM range)
        self.regs.REQ_ADDR.val = self.tq.REQ_ADDR1[qid].val
        # doesnt go through if 0
        self.regs.REQ_INFO.val = self.tq.REQ_SIZE1[qid].val | req.td_count
        # let's do magic
        self.regs.REQ_PUSH.val = self.tq_prty[qid] | ((qid & 7) << 8)

        self.get_tm_status()
        self.get_committed_info()
        self.irq_handler()
        self.tq.STATUS[qid].val = 0x0  # done

    def get_tm_status(self, max_timeouts=100, interval=0.01):
        """Poll TM_STATUS until bit 0 is set or the attempts run out.

        Args:
            max_timeouts: maximum number of polls; values below 1 are
                treated as 1 so the register is always read at least once.
            interval: seconds to sleep after each unsuccessful poll.

        Returns:
            True if bit 0 of TM_STATUS was seen set, False on timeout.
        """
        status = 0
        success = False
        for _ in range(max(1, max_timeouts)):
            status = self.regs.TM_STATUS.val
            success = (status & 1) != 0
            print('tm status: 0x%x, success: %r' % (status, success))
            if success:
                return success
            time.sleep(interval)
        print('timeout, tm is non-idle! status: 0x%x' % status)
        return success

    def get_committed_info(self):
        """Print the nid held in bits 16..23 of COMMIT_INFO."""
        committed_nid = (self.regs.COMMIT_INFO.val >> 0x10) & 0xff
        print('pushed td w/ nid 0x%x to execution' % committed_nid)

    def irq_handler(self):
        """Drain event line 0, set bit 1 of UNK_IRQ_ACK, then drain line 1."""
        self._drain_irq_line(0, 'IRQ_EVT1')
        self.regs.UNK_IRQ_ACK.val = self.regs.UNK_IRQ_ACK.val | 2
        self._drain_irq_line(1, 'IRQ_EVT2')

    def _drain_irq_line(self, line, prefix):
        """Read and print every pending event of one IRQ line.

        *prefix* selects the register group (``IRQ_EVT1`` or ``IRQ_EVT2``).
        For each event the four data registers are read in the fixed order
        INFO, UNK1, TIME, UNK2.
        """
        regs = self.regs
        evtcnt = getattr(regs, prefix + '_CNT').val
        print('irq handler: LINE %d EVTCNT: %d' % (line, evtcnt))
        for evt_n in range(evtcnt):  # needs to be cleared
            info = getattr(regs, prefix + '_DAT_INFO').val
            # UNK1/UNK2 values are not used, but the reads are kept:
            # presumably reading the data registers is what clears the
            # event — TODO confirm.
            getattr(regs, prefix + '_DAT_UNK1').val
            tmstmp = getattr(regs, prefix + '_DAT_TIME').val
            getattr(regs, prefix + '_DAT_UNK2').val
            print('irq handler: LINE %d EVT %d: executed info 0x%x @ 0x%x'
                  % (line, evt_n, info, tmstmp))
|
Loading…
Reference in a new issue