u-boot/drivers/spi/rk_spi.c

569 lines
15 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0+
/*
* spi driver for rockchip
*
rockchip: spi: add optimised receive-only implementation For the RK3399-Q7 we recommend storing SPL and u-boot.itb in the on-module 32MBit (and sometimes even larger, if requested as part of a configure-to-order configuration) SPI-NOR flash that is clocked for a bitrate of 49.5MBit/s and connected in a single-IO configuration (the RK3399 only supports single-IO for SPI). Unfortunately, the existing SPI driver is excruciatingly slow at reading out large chunks of data (in fact it is just as slow for small chunks of data, but the overheads of the driver-framework make it less noticeable): before this change, the throughput on a 4MB read from SPI-NOR is 8.47MBit/s which equates a 17.11% bus-utilisation. To improve on this, this commit adds an optimised receive-only transfer (i.e.: out == NULL) handler that hooks into the main transfer function and processes data in 16bit frames (utilising the full with of each FIFO element). As of now, the receive-only handler requires the in-buffer to be 16bit aligned. Any lingering data (i.e. either if the in-buffer was not 16-bit aligned or if an odd number of bytes are to be received) will be handled by the original 8bit reader/wirter. Given that the SPI controller's documentation does not guarantuee any interlocking between the RXFIFO and the master SCLK, the transfer loop will be restarted for each chunk of 32 frames (i.e. 64 bytes). With this new receive-only transfer handler, the throughput for a 4MB read increases to 36.28MBit/s (i.e. 73.29% bus-utilisation): this is a 4x improvement over the baseline. Signed-off-by: Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Reported-by: Klaus Goger <klaus.goger@theobroma-systems.com> Series-Cc: Klaus Goger <klaus.goger@theobroma-systems.com> Series-Cc: Christoph Muellner <christoph.muellner@theobroma-systems.com>
2019-02-03 15:17:31 +00:00
* (C) 2019 Theobroma Systems Design und Consulting GmbH
*
* (C) Copyright 2015 Google, Inc
*
* (C) Copyright 2008-2013 Rockchip Electronics
* Peter, Software Engineering, <superpeter.cai@gmail.com>.
*/
#include <common.h>
#include <clk.h>
#include <dm.h>
#include <dt-structs.h>
#include <errno.h>
#include <log.h>
#include <spi.h>
#include <time.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <asm/io.h>
#include <asm/arch-rockchip/clock.h>
#include <asm/arch-rockchip/periph.h>
#include <dm/pinctrl.h>
#include "rk_spi.h"
/* Change to 1 to output registers at the start of each transaction */
#define DEBUG_RK_SPI 0
/*
* ctrlr1 is 16-bits, so we should support lengths of 0xffff + 1. However,
* the controller seems to hang when given 0x10000, so stick with this for now.
*/
#define ROCKCHIP_SPI_MAX_TRANLEN 0xffff
struct rockchip_spi_params {
/* RXFIFO overruns and TXFIFO underruns stop the master clock */
bool master_manages_fifo;
};
struct rockchip_spi_plat {
#if CONFIG_IS_ENABLED(OF_PLATDATA)
struct dtd_rockchip_rk3288_spi of_plat;
#endif
s32 frequency; /* Default clock frequency, -1 for none */
fdt_addr_t base;
uint deactivate_delay_us; /* Delay to wait after deactivate */
uint activate_delay_us; /* Delay to wait after activate */
};
struct rockchip_spi_priv {
struct rockchip_spi *regs;
struct clk clk;
unsigned int max_freq;
unsigned int mode;
ulong last_transaction_us; /* Time of last transaction end */
unsigned int speed_hz;
unsigned int last_speed_hz;
uint input_rate;
};
#define SPI_FIFO_DEPTH 32
static void rkspi_dump_regs(struct rockchip_spi *regs)
{
debug("ctrl0: \t\t0x%08x\n", readl(&regs->ctrlr0));
debug("ctrl1: \t\t0x%08x\n", readl(&regs->ctrlr1));
debug("ssienr: \t\t0x%08x\n", readl(&regs->enr));
debug("ser: \t\t0x%08x\n", readl(&regs->ser));
debug("baudr: \t\t0x%08x\n", readl(&regs->baudr));
debug("txftlr: \t\t0x%08x\n", readl(&regs->txftlr));
debug("rxftlr: \t\t0x%08x\n", readl(&regs->rxftlr));
debug("txflr: \t\t0x%08x\n", readl(&regs->txflr));
debug("rxflr: \t\t0x%08x\n", readl(&regs->rxflr));
debug("sr: \t\t0x%08x\n", readl(&regs->sr));
debug("imr: \t\t0x%08x\n", readl(&regs->imr));
debug("isr: \t\t0x%08x\n", readl(&regs->isr));
debug("dmacr: \t\t0x%08x\n", readl(&regs->dmacr));
debug("dmatdlr: \t0x%08x\n", readl(&regs->dmatdlr));
debug("dmardlr: \t0x%08x\n", readl(&regs->dmardlr));
}
static void rkspi_enable_chip(struct rockchip_spi *regs, bool enable)
{
writel(enable ? 1 : 0, &regs->enr);
}
static void rkspi_set_clk(struct rockchip_spi_priv *priv, uint speed)
{
rockchip: spi: rewrite rkspi_set_clk for a more conservative baudrate setting The baudrate in rkspi was calculated by using an integer division (which implicitly discarded any fractional result), then rounding to an even number and finally clamping to 0xfffe using a bitwise AND operator. This introduced two issues: 1) for very small baudrates (overflowing the 0xfffe range), the bitwise-AND generates rather random-looking (wildly varying) actual output bitrates 2) for higher baudrates, the calculation tends to 'err towards a higher baudrate' with the actual error increasing as the dividers become very small. E.g., with a 99MHz input clock, a request for a 20MBit baudrate (99/20 = 4.95), a 24.75 MBit would be use (which amounts to a 23.75% error)... for a 34 MBit request this would be an actual outbout of 49.5 Mbit (i.e. a 45% error). This change rewrites the divider selection (i.e. baudrate calculation) by making sure that a) for the normal case: the largest representable baudrate below the requested rate will be chosen; b) for the denormal case (i.e. when the divider can no longer be represented), the lowest representable baudrate is chosen. Even though the denormal case (b) may be of little concern in real world applications (even with a 198MHz input clock, this will only happen at below approx. 3kHz/3kBit), our board-verification team kept complaining. Signed-off-by: Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Tested-by: Klaus Goger <klaus.goger@theobroma-systems.com>
2017-04-20 20:05:52 +00:00
/*
* We should try not to exceed the speed requested by the caller:
* when selecting a divider, we need to make sure we round up.
*/
uint clk_div = DIV_ROUND_UP(priv->input_rate, speed);
/* The baudrate register (BAUDR) is defined as a 32bit register where
* the upper 16bit are reserved and having 'Fsclk_out' in the lower
* 16bits with 'Fsclk_out' defined as follows:
*
* Fsclk_out = Fspi_clk/ SCKDV
* Where SCKDV is any even value between 2 and 65534.
*/
if (clk_div > 0xfffe) {
clk_div = 0xfffe;
debug("%s: can't divide down to %d Hz (actual will be %d Hz)\n",
rockchip: spi: rewrite rkspi_set_clk for a more conservative baudrate setting The baudrate in rkspi was calculated by using an integer division (which implicitly discarded any fractional result), then rounding to an even number and finally clamping to 0xfffe using a bitwise AND operator. This introduced two issues: 1) for very small baudrates (overflowing the 0xfffe range), the bitwise-AND generates rather random-looking (wildly varying) actual output bitrates 2) for higher baudrates, the calculation tends to 'err towards a higher baudrate' with the actual error increasing as the dividers become very small. E.g., with a 99MHz input clock, a request for a 20MBit baudrate (99/20 = 4.95), a 24.75 MBit would be use (which amounts to a 23.75% error)... for a 34 MBit request this would be an actual outbout of 49.5 Mbit (i.e. a 45% error). This change rewrites the divider selection (i.e. baudrate calculation) by making sure that a) for the normal case: the largest representable baudrate below the requested rate will be chosen; b) for the denormal case (i.e. when the divider can no longer be represented), the lowest representable baudrate is chosen. Even though the denormal case (b) may be of little concern in real world applications (even with a 198MHz input clock, this will only happen at below approx. 3kHz/3kBit), our board-verification team kept complaining. Signed-off-by: Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Tested-by: Klaus Goger <klaus.goger@theobroma-systems.com>
2017-04-20 20:05:52 +00:00
__func__, speed, priv->input_rate / clk_div);
}
/* Round up to the next even 16bit number */
clk_div = (clk_div + 1) & 0xfffe;
debug("spi speed %u, div %u\n", speed, clk_div);
rockchip: spi: rewrite rkspi_set_clk for a more conservative baudrate setting The baudrate in rkspi was calculated by using an integer division (which implicitly discarded any fractional result), then rounding to an even number and finally clamping to 0xfffe using a bitwise AND operator. This introduced two issues: 1) for very small baudrates (overflowing the 0xfffe range), the bitwise-AND generates rather random-looking (wildly varying) actual output bitrates 2) for higher baudrates, the calculation tends to 'err towards a higher baudrate' with the actual error increasing as the dividers become very small. E.g., with a 99MHz input clock, a request for a 20MBit baudrate (99/20 = 4.95), a 24.75 MBit would be use (which amounts to a 23.75% error)... for a 34 MBit request this would be an actual outbout of 49.5 Mbit (i.e. a 45% error). This change rewrites the divider selection (i.e. baudrate calculation) by making sure that a) for the normal case: the largest representable baudrate below the requested rate will be chosen; b) for the denormal case (i.e. when the divider can no longer be represented), the lowest representable baudrate is chosen. Even though the denormal case (b) may be of little concern in real world applications (even with a 198MHz input clock, this will only happen at below approx. 3kHz/3kBit), our board-verification team kept complaining. Signed-off-by: Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Tested-by: Klaus Goger <klaus.goger@theobroma-systems.com>
2017-04-20 20:05:52 +00:00
clrsetbits_le32(&priv->regs->baudr, 0xffff, clk_div);
priv->last_speed_hz = speed;
}
static int rkspi_wait_till_not_busy(struct rockchip_spi *regs)
{
unsigned long start;
start = get_timer(0);
while (readl(&regs->sr) & SR_BUSY) {
if (get_timer(start) > ROCKCHIP_SPI_TIMEOUT_MS) {
debug("RK SPI: Status keeps busy for 1000us after a read/write!\n");
return -ETIMEDOUT;
}
}
return 0;
}
static void spi_cs_activate(struct udevice *dev, uint cs)
{
struct udevice *bus = dev->parent;
struct rockchip_spi_plat *plat = dev_get_plat(bus);
struct rockchip_spi_priv *priv = dev_get_priv(bus);
struct rockchip_spi *regs = priv->regs;
/* If it's too soon to do another transaction, wait */
if (plat->deactivate_delay_us && priv->last_transaction_us) {
ulong delay_us; /* The delay completed so far */
delay_us = timer_get_us() - priv->last_transaction_us;
if (delay_us < plat->deactivate_delay_us) {
ulong additional_delay_us =
plat->deactivate_delay_us - delay_us;
debug("%s: delaying by %ld us\n",
__func__, additional_delay_us);
udelay(additional_delay_us);
}
}
debug("activate cs%u\n", cs);
writel(1 << cs, &regs->ser);
if (plat->activate_delay_us)
udelay(plat->activate_delay_us);
}
static void spi_cs_deactivate(struct udevice *dev, uint cs)
{
struct udevice *bus = dev->parent;
struct rockchip_spi_plat *plat = dev_get_plat(bus);
struct rockchip_spi_priv *priv = dev_get_priv(bus);
struct rockchip_spi *regs = priv->regs;
debug("deactivate cs%u\n", cs);
writel(0, &regs->ser);
/* Remember time of this transaction so we can honour the bus delay */
if (plat->deactivate_delay_us)
priv->last_transaction_us = timer_get_us();
}
#if CONFIG_IS_ENABLED(OF_PLATDATA)
static int conv_of_plat(struct udevice *dev)
{
struct rockchip_spi_plat *plat = dev_get_plat(dev);
struct dtd_rockchip_rk3288_spi *dtplat = &plat->of_plat;
struct rockchip_spi_priv *priv = dev_get_priv(dev);
int ret;
plat->base = dtplat->reg[0];
plat->frequency = 20000000;
ret = clk_get_by_phandle(dev, dtplat->clocks, &priv->clk);
if (ret < 0)
return ret;
return 0;
}
#endif
static int rockchip_spi_of_to_plat(struct udevice *bus)
{
struct rockchip_spi_plat *plat = dev_get_plat(bus);
struct rockchip_spi_priv *priv = dev_get_priv(bus);
int ret;
if (CONFIG_IS_ENABLED(OF_REAL)) {
plat->base = dev_read_addr(bus);
ret = clk_get_by_index(bus, 0, &priv->clk);
if (ret < 0) {
debug("%s: Could not get clock for %s: %d\n", __func__,
bus->name, ret);
return ret;
}
plat->frequency = dev_read_u32_default(bus, "spi-max-frequency",
50000000);
plat->deactivate_delay_us =
dev_read_u32_default(bus, "spi-deactivate-delay", 0);
plat->activate_delay_us =
dev_read_u32_default(bus, "spi-activate-delay", 0);
debug("%s: base=%x, max-frequency=%d, deactivate_delay=%d\n",
__func__, (uint)plat->base, plat->frequency,
plat->deactivate_delay_us);
}
return 0;
}
static int rockchip_spi_calc_modclk(ulong max_freq)
{
/*
* While this is not strictly correct for the RK3368, as the
* GPLL will be 576MHz, things will still work, as the
* clk_set_rate(...) implementation in our clock-driver will
* chose the next closest rate not exceeding what we request
* based on the output of this function.
*/
unsigned div;
const unsigned long gpll_hz = 594000000UL;
/*
* We need to find an input clock that provides at least twice
* the maximum frequency and can be generated from the assumed
* speed of GPLL (594MHz) using an integer divider.
*
* To give us more achievable bitrates at higher speeds (these
* are generated by dividing by an even 16-bit integer from
* this frequency), we try to have an input frequency of at
* least 4x our max_freq.
*/
div = DIV_ROUND_UP(gpll_hz, max_freq * 4);
return gpll_hz / div;
}
static int rockchip_spi_probe(struct udevice *bus)
{
struct rockchip_spi_plat *plat = dev_get_plat(bus);
struct rockchip_spi_priv *priv = dev_get_priv(bus);
int ret;
debug("%s: probe\n", __func__);
#if CONFIG_IS_ENABLED(OF_PLATDATA)
ret = conv_of_plat(bus);
if (ret)
return ret;
#endif
priv->regs = (struct rockchip_spi *)plat->base;
priv->last_transaction_us = timer_get_us();
priv->max_freq = plat->frequency;
/* Clamp the value from the DTS against any hardware limits */
if (priv->max_freq > ROCKCHIP_SPI_MAX_RATE)
priv->max_freq = ROCKCHIP_SPI_MAX_RATE;
/* Find a module-input clock that fits with the max_freq setting */
ret = clk_set_rate(&priv->clk,
rockchip_spi_calc_modclk(priv->max_freq));
if (ret < 0) {
debug("%s: Failed to set clock: %d\n", __func__, ret);
return ret;
}
priv->input_rate = ret;
debug("%s: rate = %u\n", __func__, priv->input_rate);
return 0;
}
static int rockchip_spi_claim_bus(struct udevice *dev)
{
struct udevice *bus = dev->parent;
struct rockchip_spi_priv *priv = dev_get_priv(bus);
struct rockchip_spi *regs = priv->regs;
uint ctrlr0;
/* Disable the SPI hardware */
rkspi_enable_chip(regs, false);
if (priv->speed_hz != priv->last_speed_hz)
rkspi_set_clk(priv, priv->speed_hz);
/* Operation Mode */
ctrlr0 = OMOD_MASTER << OMOD_SHIFT;
/* Data Frame Size */
ctrlr0 |= DFS_8BIT << DFS_SHIFT;
/* set SPI mode 0..3 */
if (priv->mode & SPI_CPOL)
ctrlr0 |= SCOL_HIGH << SCOL_SHIFT;
if (priv->mode & SPI_CPHA)
ctrlr0 |= SCPH_TOGSTA << SCPH_SHIFT;
/* Chip Select Mode */
ctrlr0 |= CSM_KEEP << CSM_SHIFT;
/* SSN to Sclk_out delay */
ctrlr0 |= SSN_DELAY_ONE << SSN_DELAY_SHIFT;
/* Serial Endian Mode */
ctrlr0 |= SEM_LITTLE << SEM_SHIFT;
/* First Bit Mode */
ctrlr0 |= FBM_MSB << FBM_SHIFT;
/* Byte and Halfword Transform */
ctrlr0 |= HALF_WORD_OFF << HALF_WORD_TX_SHIFT;
/* Rxd Sample Delay */
ctrlr0 |= 0 << RXDSD_SHIFT;
/* Frame Format */
ctrlr0 |= FRF_SPI << FRF_SHIFT;
/* Tx and Rx mode */
ctrlr0 |= TMOD_TR << TMOD_SHIFT;
writel(ctrlr0, &regs->ctrlr0);
return 0;
}
static int rockchip_spi_release_bus(struct udevice *dev)
{
struct udevice *bus = dev->parent;
struct rockchip_spi_priv *priv = dev_get_priv(bus);
rkspi_enable_chip(priv->regs, false);
return 0;
}
rockchip: spi: add optimised receive-only implementation For the RK3399-Q7 we recommend storing SPL and u-boot.itb in the on-module 32MBit (and sometimes even larger, if requested as part of a configure-to-order configuration) SPI-NOR flash that is clocked for a bitrate of 49.5MBit/s and connected in a single-IO configuration (the RK3399 only supports single-IO for SPI). Unfortunately, the existing SPI driver is excruciatingly slow at reading out large chunks of data (in fact it is just as slow for small chunks of data, but the overheads of the driver-framework make it less noticeable): before this change, the throughput on a 4MB read from SPI-NOR is 8.47MBit/s which equates a 17.11% bus-utilisation. To improve on this, this commit adds an optimised receive-only transfer (i.e.: out == NULL) handler that hooks into the main transfer function and processes data in 16bit frames (utilising the full with of each FIFO element). As of now, the receive-only handler requires the in-buffer to be 16bit aligned. Any lingering data (i.e. either if the in-buffer was not 16-bit aligned or if an odd number of bytes are to be received) will be handled by the original 8bit reader/wirter. Given that the SPI controller's documentation does not guarantuee any interlocking between the RXFIFO and the master SCLK, the transfer loop will be restarted for each chunk of 32 frames (i.e. 64 bytes). With this new receive-only transfer handler, the throughput for a 4MB read increases to 36.28MBit/s (i.e. 73.29% bus-utilisation): this is a 4x improvement over the baseline. Signed-off-by: Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Reported-by: Klaus Goger <klaus.goger@theobroma-systems.com> Series-Cc: Klaus Goger <klaus.goger@theobroma-systems.com> Series-Cc: Christoph Muellner <christoph.muellner@theobroma-systems.com>
2019-02-03 15:17:31 +00:00
static inline int rockchip_spi_16bit_reader(struct udevice *dev,
u8 **din, int *len)
{
struct udevice *bus = dev->parent;
const struct rockchip_spi_params * const data =
(void *)dev_get_driver_data(bus);
struct rockchip_spi_priv *priv = dev_get_priv(bus);
struct rockchip_spi *regs = priv->regs;
const u32 saved_ctrlr0 = readl(&regs->ctrlr0);
#if defined(DEBUG)
u32 statistics_rxlevels[33] = { };
#endif
u32 frames = *len / 2;
u8 *in = (u8 *)(*din);
rockchip: spi: add optimised receive-only implementation For the RK3399-Q7 we recommend storing SPL and u-boot.itb in the on-module 32MBit (and sometimes even larger, if requested as part of a configure-to-order configuration) SPI-NOR flash that is clocked for a bitrate of 49.5MBit/s and connected in a single-IO configuration (the RK3399 only supports single-IO for SPI). Unfortunately, the existing SPI driver is excruciatingly slow at reading out large chunks of data (in fact it is just as slow for small chunks of data, but the overheads of the driver-framework make it less noticeable): before this change, the throughput on a 4MB read from SPI-NOR is 8.47MBit/s which equates a 17.11% bus-utilisation. To improve on this, this commit adds an optimised receive-only transfer (i.e.: out == NULL) handler that hooks into the main transfer function and processes data in 16bit frames (utilising the full with of each FIFO element). As of now, the receive-only handler requires the in-buffer to be 16bit aligned. Any lingering data (i.e. either if the in-buffer was not 16-bit aligned or if an odd number of bytes are to be received) will be handled by the original 8bit reader/wirter. Given that the SPI controller's documentation does not guarantuee any interlocking between the RXFIFO and the master SCLK, the transfer loop will be restarted for each chunk of 32 frames (i.e. 64 bytes). With this new receive-only transfer handler, the throughput for a 4MB read increases to 36.28MBit/s (i.e. 73.29% bus-utilisation): this is a 4x improvement over the baseline. Signed-off-by: Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Reported-by: Klaus Goger <klaus.goger@theobroma-systems.com> Series-Cc: Klaus Goger <klaus.goger@theobroma-systems.com> Series-Cc: Christoph Muellner <christoph.muellner@theobroma-systems.com>
2019-02-03 15:17:31 +00:00
u32 max_chunk_size = SPI_FIFO_DEPTH;
if (!frames)
return 0;
/*
* If we know that the hardware will manage RXFIFO overruns
* (i.e. stop the SPI clock until there's space in the FIFO),
* we the allow largest possible chunk size that can be
* represented in CTRLR1.
*/
if (data && data->master_manages_fifo)
max_chunk_size = ROCKCHIP_SPI_MAX_TRANLEN;
rockchip: spi: add optimised receive-only implementation For the RK3399-Q7 we recommend storing SPL and u-boot.itb in the on-module 32MBit (and sometimes even larger, if requested as part of a configure-to-order configuration) SPI-NOR flash that is clocked for a bitrate of 49.5MBit/s and connected in a single-IO configuration (the RK3399 only supports single-IO for SPI). Unfortunately, the existing SPI driver is excruciatingly slow at reading out large chunks of data (in fact it is just as slow for small chunks of data, but the overheads of the driver-framework make it less noticeable): before this change, the throughput on a 4MB read from SPI-NOR is 8.47MBit/s which equates a 17.11% bus-utilisation. To improve on this, this commit adds an optimised receive-only transfer (i.e.: out == NULL) handler that hooks into the main transfer function and processes data in 16bit frames (utilising the full with of each FIFO element). As of now, the receive-only handler requires the in-buffer to be 16bit aligned. Any lingering data (i.e. either if the in-buffer was not 16-bit aligned or if an odd number of bytes are to be received) will be handled by the original 8bit reader/wirter. Given that the SPI controller's documentation does not guarantuee any interlocking between the RXFIFO and the master SCLK, the transfer loop will be restarted for each chunk of 32 frames (i.e. 64 bytes). With this new receive-only transfer handler, the throughput for a 4MB read increases to 36.28MBit/s (i.e. 73.29% bus-utilisation): this is a 4x improvement over the baseline. Signed-off-by: Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Reported-by: Klaus Goger <klaus.goger@theobroma-systems.com> Series-Cc: Klaus Goger <klaus.goger@theobroma-systems.com> Series-Cc: Christoph Muellner <christoph.muellner@theobroma-systems.com>
2019-02-03 15:17:31 +00:00
// rockchip_spi_configure(dev, mode, size)
rkspi_enable_chip(regs, false);
clrsetbits_le32(&regs->ctrlr0,
TMOD_MASK << TMOD_SHIFT,
TMOD_RO << TMOD_SHIFT);
/* 16bit data frame size */
clrsetbits_le32(&regs->ctrlr0, DFS_MASK, DFS_16BIT);
/* Update caller's context */
const u32 bytes_to_process = 2 * frames;
*din += bytes_to_process;
*len -= bytes_to_process;
/* Process our frames */
while (frames) {
u32 chunk_size = min(frames, max_chunk_size);
frames -= chunk_size;
writew(chunk_size - 1, &regs->ctrlr1);
rkspi_enable_chip(regs, true);
do {
u32 rx_level = readw(&regs->rxflr);
#if defined(DEBUG)
statistics_rxlevels[rx_level]++;
#endif
chunk_size -= rx_level;
while (rx_level--) {
u16 val = readw(regs->rxdr);
*in++ = val & 0xff;
*in++ = val >> 8;
}
rockchip: spi: add optimised receive-only implementation For the RK3399-Q7 we recommend storing SPL and u-boot.itb in the on-module 32MBit (and sometimes even larger, if requested as part of a configure-to-order configuration) SPI-NOR flash that is clocked for a bitrate of 49.5MBit/s and connected in a single-IO configuration (the RK3399 only supports single-IO for SPI). Unfortunately, the existing SPI driver is excruciatingly slow at reading out large chunks of data (in fact it is just as slow for small chunks of data, but the overheads of the driver-framework make it less noticeable): before this change, the throughput on a 4MB read from SPI-NOR is 8.47MBit/s which equates a 17.11% bus-utilisation. To improve on this, this commit adds an optimised receive-only transfer (i.e.: out == NULL) handler that hooks into the main transfer function and processes data in 16bit frames (utilising the full with of each FIFO element). As of now, the receive-only handler requires the in-buffer to be 16bit aligned. Any lingering data (i.e. either if the in-buffer was not 16-bit aligned or if an odd number of bytes are to be received) will be handled by the original 8bit reader/wirter. Given that the SPI controller's documentation does not guarantuee any interlocking between the RXFIFO and the master SCLK, the transfer loop will be restarted for each chunk of 32 frames (i.e. 64 bytes). With this new receive-only transfer handler, the throughput for a 4MB read increases to 36.28MBit/s (i.e. 73.29% bus-utilisation): this is a 4x improvement over the baseline. Signed-off-by: Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Reported-by: Klaus Goger <klaus.goger@theobroma-systems.com> Series-Cc: Klaus Goger <klaus.goger@theobroma-systems.com> Series-Cc: Christoph Muellner <christoph.muellner@theobroma-systems.com>
2019-02-03 15:17:31 +00:00
} while (chunk_size);
rkspi_enable_chip(regs, false);
}
#if defined(DEBUG)
debug("%s: observed rx_level during processing:\n", __func__);
for (int i = 0; i <= 32; ++i)
if (statistics_rxlevels[i])
debug("\t%2d: %d\n", i, statistics_rxlevels[i]);
#endif
/* Restore the original transfer setup and return error-free. */
writel(saved_ctrlr0, &regs->ctrlr0);
return 0;
}
static int rockchip_spi_xfer(struct udevice *dev, unsigned int bitlen,
const void *dout, void *din, unsigned long flags)
{
struct udevice *bus = dev->parent;
struct rockchip_spi_priv *priv = dev_get_priv(bus);
struct rockchip_spi *regs = priv->regs;
struct dm_spi_slave_plat *slave_plat = dev_get_parent_plat(dev);
int len = bitlen >> 3;
const u8 *out = dout;
u8 *in = din;
int toread, towrite;
rockchip: spi: add optimised receive-only implementation For the RK3399-Q7 we recommend storing SPL and u-boot.itb in the on-module 32MBit (and sometimes even larger, if requested as part of a configure-to-order configuration) SPI-NOR flash that is clocked for a bitrate of 49.5MBit/s and connected in a single-IO configuration (the RK3399 only supports single-IO for SPI). Unfortunately, the existing SPI driver is excruciatingly slow at reading out large chunks of data (in fact it is just as slow for small chunks of data, but the overheads of the driver-framework make it less noticeable): before this change, the throughput on a 4MB read from SPI-NOR is 8.47MBit/s which equates a 17.11% bus-utilisation. To improve on this, this commit adds an optimised receive-only transfer (i.e.: out == NULL) handler that hooks into the main transfer function and processes data in 16bit frames (utilising the full with of each FIFO element). As of now, the receive-only handler requires the in-buffer to be 16bit aligned. Any lingering data (i.e. either if the in-buffer was not 16-bit aligned or if an odd number of bytes are to be received) will be handled by the original 8bit reader/wirter. Given that the SPI controller's documentation does not guarantuee any interlocking between the RXFIFO and the master SCLK, the transfer loop will be restarted for each chunk of 32 frames (i.e. 64 bytes). With this new receive-only transfer handler, the throughput for a 4MB read increases to 36.28MBit/s (i.e. 73.29% bus-utilisation): this is a 4x improvement over the baseline. Signed-off-by: Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Reported-by: Klaus Goger <klaus.goger@theobroma-systems.com> Series-Cc: Klaus Goger <klaus.goger@theobroma-systems.com> Series-Cc: Christoph Muellner <christoph.muellner@theobroma-systems.com>
2019-02-03 15:17:31 +00:00
int ret = 0;
debug("%s: dout=%p, din=%p, len=%x, flags=%lx\n", __func__, dout, din,
len, flags);
if (DEBUG_RK_SPI)
rkspi_dump_regs(regs);
/* Assert CS before transfer */
if (flags & SPI_XFER_BEGIN)
spi_cs_activate(dev, slave_plat->cs);
rockchip: spi: add optimised receive-only implementation For the RK3399-Q7 we recommend storing SPL and u-boot.itb in the on-module 32MBit (and sometimes even larger, if requested as part of a configure-to-order configuration) SPI-NOR flash that is clocked for a bitrate of 49.5MBit/s and connected in a single-IO configuration (the RK3399 only supports single-IO for SPI). Unfortunately, the existing SPI driver is excruciatingly slow at reading out large chunks of data (in fact it is just as slow for small chunks of data, but the overheads of the driver-framework make it less noticeable): before this change, the throughput on a 4MB read from SPI-NOR is 8.47MBit/s which equates a 17.11% bus-utilisation. To improve on this, this commit adds an optimised receive-only transfer (i.e.: out == NULL) handler that hooks into the main transfer function and processes data in 16bit frames (utilising the full with of each FIFO element). As of now, the receive-only handler requires the in-buffer to be 16bit aligned. Any lingering data (i.e. either if the in-buffer was not 16-bit aligned or if an odd number of bytes are to be received) will be handled by the original 8bit reader/wirter. Given that the SPI controller's documentation does not guarantuee any interlocking between the RXFIFO and the master SCLK, the transfer loop will be restarted for each chunk of 32 frames (i.e. 64 bytes). With this new receive-only transfer handler, the throughput for a 4MB read increases to 36.28MBit/s (i.e. 73.29% bus-utilisation): this is a 4x improvement over the baseline. Signed-off-by: Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Reported-by: Klaus Goger <klaus.goger@theobroma-systems.com> Series-Cc: Klaus Goger <klaus.goger@theobroma-systems.com> Series-Cc: Christoph Muellner <christoph.muellner@theobroma-systems.com>
2019-02-03 15:17:31 +00:00
/*
* To ensure fast loading of firmware images (e.g. full U-Boot
* stage, ATF, Linux kernel) from SPI flash, we optimise the
* case of read-only transfers by using the full 16bits of each
* FIFO element.
*/
if (!out)
ret = rockchip_spi_16bit_reader(dev, &in, &len);
/* This is the original 8bit reader/writer code */
while (len > 0) {
int todo = min(len, ROCKCHIP_SPI_MAX_TRANLEN);
rkspi_enable_chip(regs, false);
writel(todo - 1, &regs->ctrlr1);
rkspi_enable_chip(regs, true);
toread = todo;
towrite = todo;
while (toread || towrite) {
u32 status = readl(&regs->sr);
if (towrite && !(status & SR_TF_FULL)) {
writel(out ? *out++ : 0, regs->txdr);
towrite--;
}
if (toread && !(status & SR_RF_EMPT)) {
u32 byte = readl(regs->rxdr);
if (in)
*in++ = byte;
toread--;
}
}
/*
* In case that there's a transmit-component, we need to wait
* until the control goes idle before we can disable the SPI
* control logic (as this will implicitly flush the FIFOs).
*/
if (out) {
ret = rkspi_wait_till_not_busy(regs);
if (ret)
break;
}
len -= todo;
}
/* Deassert CS after transfer */
if (flags & SPI_XFER_END)
spi_cs_deactivate(dev, slave_plat->cs);
rkspi_enable_chip(regs, false);
return ret;
}
static int rockchip_spi_set_speed(struct udevice *bus, uint speed)
{
struct rockchip_spi_priv *priv = dev_get_priv(bus);
/* Clamp to the maximum frequency specified in the DTS */
if (speed > priv->max_freq)
speed = priv->max_freq;
priv->speed_hz = speed;
return 0;
}
static int rockchip_spi_set_mode(struct udevice *bus, uint mode)
{
struct rockchip_spi_priv *priv = dev_get_priv(bus);
priv->mode = mode;
return 0;
}
static const struct dm_spi_ops rockchip_spi_ops = {
.claim_bus = rockchip_spi_claim_bus,
.release_bus = rockchip_spi_release_bus,
.xfer = rockchip_spi_xfer,
.set_speed = rockchip_spi_set_speed,
.set_mode = rockchip_spi_set_mode,
/*
* cs_info is not needed, since we require all chip selects to be
* in the device tree explicitly
*/
};
const struct rockchip_spi_params rk3399_spi_params = {
.master_manages_fifo = true,
};
static const struct udevice_id rockchip_spi_ids[] = {
{ .compatible = "rockchip,rk3066-spi" },
{ .compatible = "rockchip,rk3288-spi" },
{ .compatible = "rockchip,rk3328-spi" },
{ .compatible = "rockchip,rk3368-spi",
.data = (ulong)&rk3399_spi_params },
{ .compatible = "rockchip,rk3399-spi",
.data = (ulong)&rk3399_spi_params },
{ }
};
U_BOOT_DRIVER(rockchip_rk3288_spi) = {
.name = "rockchip_rk3288_spi",
.id = UCLASS_SPI,
.of_match = rockchip_spi_ids,
.ops = &rockchip_spi_ops,
.of_to_plat = rockchip_spi_of_to_plat,
.plat_auto = sizeof(struct rockchip_spi_plat),
.priv_auto = sizeof(struct rockchip_spi_priv),
.probe = rockchip_spi_probe,
};
DM_DRIVER_ALIAS(rockchip_rk3288_spi, rockchip_rk3368_spi)