Merge branch '2019-05-05-master-imports'

- Various assorted fixes - btrfs zstd compression support - Enable hardware DDR levelling on am43xx platforms. - pl310 cache controller driver
2025-03-16 23:07:00 +00:00 · 2019-05-05 12:25:39 -04:00 · 2019-05-05 12:25:39 -04:00 · abad176da1
commit abad176da1
parent 86f578ee85 9337a08768
57 changed files with 9075 additions and 47 deletions
--- a/Documentation/devicetree/bindings/arm/l2c2x0.txt
+++ b/Documentation/devicetree/bindings/arm/l2c2x0.txt
@ -0,0 +1,114 @@
+* ARM L2 Cache Controller
+
+ARM cores often have a separate L2C210/L2C220/L2C310 (also known as PL210/PL220/
+PL310 and variants) based level 2 cache controller. All these various implementations
+of the L2 cache controller have compatible programming models (Note 1).
+Some of the properties that are just prefixed "cache-*" are taken from section
+3.7.3 of the Devicetree Specification which can be found at:
+https://www.devicetree.org/specifications/
+
+The ARM L2 cache representation in the device tree should be done as follows:
+
+Required properties:
+
+- compatible : should be one of:
+  "arm,pl310-cache"
+  "arm,l220-cache"
+  "arm,l210-cache"
+  "bcm,bcm11351-a2-pl310-cache": DEPRECATED by "brcm,bcm11351-a2-pl310-cache"
+  "brcm,bcm11351-a2-pl310-cache": For Broadcom bcm11351 chipset where an
+     offset needs to be added to the address before passing down to the L2
+     cache controller
+  "marvell,aurora-system-cache": Marvell Controller designed to be
+     compatible with the ARM one, with system cache mode (meaning
+     maintenance operations on L1 are broadcasted to the L2 and L2
+     performs the same operation).
+  "marvell,aurora-outer-cache": Marvell Controller designed to be
+     compatible with the ARM one with outer cache mode.
+  "marvell,tauros3-cache": Marvell Tauros3 cache controller, compatible
+     with arm,pl310-cache controller.
+- cache-unified : Specifies the cache is a unified cache.
+- cache-level : Should be set to 2 for a level 2 cache.
+- reg : Physical base address and size of cache controller's memory mapped
+  registers.
+
+Optional properties:
+
+- arm,data-latency : Cycles of latency for Data RAM accesses. Specifies 3 cells of
+  read, write and setup latencies. Minimum valid values are 1. Controllers
+  without setup latency control should use a value of 0.
+- arm,tag-latency : Cycles of latency for Tag RAM accesses. Specifies 3 cells of
+  read, write and setup latencies. Controllers without setup latency control
+  should use 0. Controllers without separate read and write Tag RAM latency
+  values should only use the first cell.
+- arm,dirty-latency : Cycles of latency for Dirty RAMs. This is a single cell.
+- arm,filter-ranges : <start length> Starting address and length of window to
+  filter. Addresses in the filter window are directed to the M1 port. Other
+  addresses will go to the M0 port.
+- arm,io-coherent : indicates that the system is operating in an hardware
+  I/O coherent mode. Valid only when the arm,pl310-cache compatible
+  string is used.
+- interrupts : 1 combined interrupt.
+- cache-size : specifies the size in bytes of the cache
+- cache-sets : specifies the number of associativity sets of the cache
+- cache-block-size : specifies the size in bytes of a cache block
+- cache-line-size : specifies the size in bytes of a line in the cache,
+  if this is not specified, the line size is assumed to be equal to the
+  cache block size
+- cache-id-part: cache id part number to be used if it is not present
+  on hardware
+- wt-override: If present then L2 is forced to Write through mode
+- arm,double-linefill : Override double linefill enable setting. Enable if
+  non-zero, disable if zero.
+- arm,double-linefill-incr : Override double linefill on INCR read. Enable
+  if non-zero, disable if zero.
+- arm,double-linefill-wrap : Override double linefill on WRAP read. Enable
+  if non-zero, disable if zero.
+- arm,prefetch-drop : Override prefetch drop enable setting. Enable if non-zero,
+  disable if zero.
+- arm,prefetch-offset : Override prefetch offset value. Valid values are
+  0-7, 15, 23, and 31.
+- arm,shared-override : The default behavior of the L220 or PL310 cache
+  controllers with respect to the shareable attribute is to transform "normal
+  memory non-cacheable transactions" into "cacheable no allocate" (for reads)
+  or "write through no write allocate" (for writes).
+  On systems where this may cause DMA buffer corruption, this property must be
+  specified to indicate that such transforms are precluded.
+- arm,parity-enable : enable parity checking on the L2 cache (L220 or PL310).
+- arm,parity-disable : disable parity checking on the L2 cache (L220 or PL310).
+- arm,outer-sync-disable : disable the outer sync operation on the L2 cache.
+  Some core tiles, especially ARM PB11MPCore have a faulty L220 cache that
+  will randomly hang unless outer sync operations are disabled.
+- prefetch-data : Data prefetch. Value: <0> (forcibly disable), <1>
+  (forcibly enable), property absent (retain settings set by firmware)
+- prefetch-instr : Instruction prefetch. Value: <0> (forcibly disable),
+  <1> (forcibly enable), property absent (retain settings set by
+  firmware)
+- arm,dynamic-clock-gating : L2 dynamic clock gating. Value: <0> (forcibly
+  disable), <1> (forcibly enable), property absent (OS specific behavior,
+  preferably retain firmware settings)
+- arm,standby-mode: L2 standby mode enable. Value <0> (forcibly disable),
+  <1> (forcibly enable), property absent (OS specific behavior,
+  preferably retain firmware settings)
+- arm,early-bresp-disable : Disable the CA9 optimization Early BRESP (PL310)
+- arm,full-line-zero-disable : Disable the CA9 optimization Full line of zero
+  write (PL310)
+
+Example:
+
+L2: cache-controller {
+        compatible = "arm,pl310-cache";
+        reg = <0xfff12000 0x1000>;
+        arm,data-latency = <1 1 1>;
+        arm,tag-latency = <2 2 2>;
+        arm,filter-ranges = <0x80000000 0x8000000>;
+        cache-unified;
+        cache-level = <2>;
+	interrupts = <45>;
+};
+
+Note 1: The description in this document doesn't apply to integrated L2
+	cache controllers as found in e.g. Cortex-A15/A7/A57/A53. These
+	integrated L2 controllers are assumed to be all preconfigured by
+	early secure boot code. Thus no need to deal with their configuration
+	in the kernel at all.
--- a/arch/Kconfig
+++ b/arch/Kconfig
@ -227,6 +227,15 @@ config SYS_CONFIG_NAME
 	  The header file include/configs/<CONFIG_SYS_CONFIG_NAME>.h
 	  should be included from include/config.h.

+config SYS_DISABLE_DCACHE_OPS
+	bool
+	help
+	 This option disables dcache flush and dcache invalidation
+	 operations. For example, on coherent systems where cache
+	 operatios are not required, enable this option to avoid them.
+	 Note that, its up to the individual architectures to implement
+	 this functionality.
+
 source "arch/arc/Kconfig"
 source "arch/arm/Kconfig"
 source "arch/m68k/Kconfig"
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@ -847,6 +847,7 @@ config ARCH_SOCFPGA
 	imply SYS_MMCSD_RAW_MODE_U_BOOT_USE_PARTITION_TYPE
 	imply SPL_SPI_FLASH_SUPPORT
 	imply SPL_SPI_SUPPORT
+	imply L2X0_CACHE

 config ARCH_SUNXI
 	bool "Support sunxi (Allwinner) SoCs"
--- a/arch/arm/cpu/armv8/cache_v8.c
+++ b/arch/arm/cpu/armv8/cache_v8.c
@ -443,6 +443,7 @@ inline void flush_dcache_all(void)
 		debug("flushing dcache successfully.\n");
 }

+#ifndef CONFIG_SYS_DISABLE_DCACHE_OPS
 /*
 * Invalidates range in all levels of D-cache/unified cache
 */
@ -458,6 +459,15 @@ void flush_dcache_range(unsigned long start, unsigned long stop)
 {
 	__asm_flush_dcache_range(start, stop);
 }
+#else
+void invalidate_dcache_range(unsigned long start, unsigned long stop)
+{
+}
+
+void flush_dcache_range(unsigned long start, unsigned long stop)
+{
+}
+#endif /* CONFIG_SYS_DISABLE_DCACHE_OPS */

 void dcache_enable(void)
 {
--- a/arch/arm/cpu/u-boot-spl.lds
+++ b/arch/arm/cpu/u-boot-spl.lds
@ -79,16 +79,16 @@ SECTIONS
 }

 #if defined(IMAGE_MAX_SIZE)
-ASSERT(__image_copy_end - __image_copy_start < (IMAGE_MAX_SIZE), \
+ASSERT(__image_copy_end - __image_copy_start <= (IMAGE_MAX_SIZE), \
 	"SPL image too big");
 #endif

 #if defined(CONFIG_SPL_BSS_MAX_SIZE)
-ASSERT(__bss_end - __bss_start < (CONFIG_SPL_BSS_MAX_SIZE), \
+ASSERT(__bss_end - __bss_start <= (CONFIG_SPL_BSS_MAX_SIZE), \
 	"SPL image BSS too big");
 #endif

 #if defined(CONFIG_SPL_MAX_FOOTPRINT)
-ASSERT(__bss_end - _start < (CONFIG_SPL_MAX_FOOTPRINT), \
+ASSERT(__bss_end - _start <= (CONFIG_SPL_MAX_FOOTPRINT), \
 	"SPL image plus BSS too big");
 #endif
--- a/arch/arm/include/asm/pl310.h
+++ b/arch/arm/include/asm/pl310.h
@ -18,6 +18,9 @@
 #define L310_SHARED_ATT_OVERRIDE_ENABLE		(1 << 22)
 #define L310_AUX_CTRL_DATA_PREFETCH_MASK	(1 << 28)
 #define L310_AUX_CTRL_INST_PREFETCH_MASK	(1 << 29)
+#define L310_LATENCY_CTRL_SETUP(n)		((n) << 0)
+#define L310_LATENCY_CTRL_RD(n)			((n) << 4)
+#define L310_LATENCY_CTRL_WR(n)			((n) << 8)

 #define L2X0_CACHE_ID_PART_MASK     (0xf << 6)
 #define L2X0_CACHE_ID_PART_L310     (3 << 6)
--- a/arch/arm/mach-at91/arm926ejs/u-boot-spl.lds
+++ b/arch/arm/mach-at91/arm926ejs/u-boot-spl.lds
@ -52,11 +52,11 @@ SECTIONS
 }

 #if defined(IMAGE_MAX_SIZE)
-ASSERT(__image_copy_end - __start < (IMAGE_MAX_SIZE), \
+ASSERT(__image_copy_end - __start <= (IMAGE_MAX_SIZE), \
 	"SPL image too big");
 #endif

 #if defined(CONFIG_SPL_BSS_MAX_SIZE)
-ASSERT(__bss_end - __bss_start < (CONFIG_SPL_BSS_MAX_SIZE), \
+ASSERT(__bss_end - __bss_start <= (CONFIG_SPL_BSS_MAX_SIZE), \
 	"SPL image BSS too big");
 #endif
--- a/arch/arm/mach-omap2/am33xx/board.c
+++ b/arch/arm/mach-omap2/am33xx/board.c
@ -38,6 +38,14 @@
 #include <asm/omap_musb.h>
 #include <asm/davinci_rtc.h>

+#define AM43XX_EMIF_BASE				0x4C000000
+#define AM43XX_SDRAM_CONFIG_OFFSET			0x8
+#define AM43XX_SDRAM_TYPE_MASK				0xE0000000
+#define AM43XX_SDRAM_TYPE_SHIFT				29
+#define AM43XX_SDRAM_TYPE_DDR3				3
+#define AM43XX_READ_WRITE_LEVELING_CTRL_OFFSET		0xDC
+#define AM43XX_RDWRLVLFULL_START			0x80000000
+
 DECLARE_GLOBAL_DATA_PTR;

 int dram_init(void)
@ -435,7 +443,7 @@ static void rtc_only(void)
 	struct prm_device_inst *prm_device =
 				(struct prm_device_inst *)PRM_DEVICE_INST;

-	u32 scratch1;
+	u32 scratch1, sdrc;
 	void (*resume_func)(void);

 	scratch1 = readl(&rtc->scratch1);
@ -473,8 +481,25 @@ static void rtc_only(void)
 	rtc_only_prcm_init();
 	sdram_init();

-	/* Disable EMIF_DEVOFF for normal operation and to exit self-refresh */
-	writel(0, &prm_device->emif_ctrl);
+	/* Check EMIF4D_SDRAM_CONFIG[31:29] SDRAM_TYPE */
+	/* Only perform leveling if SDRAM_TYPE = 3 (DDR3) */
+	sdrc = readl(AM43XX_EMIF_BASE + AM43XX_SDRAM_CONFIG_OFFSET);
+
+	sdrc &= AM43XX_SDRAM_TYPE_MASK;
+	sdrc >>= AM43XX_SDRAM_TYPE_SHIFT;
+
+	if (sdrc == AM43XX_SDRAM_TYPE_DDR3) {
+		writel(AM43XX_RDWRLVLFULL_START,
+		       AM43XX_EMIF_BASE +
+		       AM43XX_READ_WRITE_LEVELING_CTRL_OFFSET);
+		mdelay(1);
+
+am43xx_wait:
+		sdrc = readl(AM43XX_EMIF_BASE +
+			     AM43XX_READ_WRITE_LEVELING_CTRL_OFFSET);
+		if (sdrc == AM43XX_RDWRLVLFULL_START)
+			goto am43xx_wait;
+	}

 	resume_func = (void *)readl(&rtc->scratch0);
 	if (resume_func)
--- a/arch/arm/mach-omap2/am33xx/ddr.c
+++ b/arch/arm/mach-omap2/am33xx/ddr.c
@ -80,6 +80,11 @@ static void configure_mr(int nr, u32 cs)
 */
 void config_sdram_emif4d5(const struct emif_regs *regs, int nr)
 {
+#ifdef CONFIG_AM43XX
+	struct prm_device_inst *prm_device =
+			(struct prm_device_inst *)PRM_DEVICE_INST;
+#endif
+
 	writel(0xA0, &emif_reg[nr]->emif_pwr_mgmt_ctrl);
 	writel(0xA0, &emif_reg[nr]->emif_pwr_mgmt_ctrl_shdw);
 	writel(regs->zq_config, &emif_reg[nr]->emif_zq_config);
@ -126,6 +131,15 @@ void config_sdram_emif4d5(const struct emif_regs *regs, int nr)
 	writel(regs->ref_ctrl, &emif_reg[nr]->emif_sdram_ref_ctrl);
 	writel(regs->ref_ctrl, &emif_reg[nr]->emif_sdram_ref_ctrl_shdw);

+#ifdef CONFIG_AM43XX
+	/*
+	 * Disable EMIF_DEVOFF
+	 * -> Cold Boot: This is just rewriting the default register value.
+	 * -> RTC Resume: Must disable DEVOFF before leveling.
+	 */
+	writel(0, &prm_device->emif_ctrl);
+#endif
+
 	/* Perform hardware leveling for DDR3 */
 	if (emif_sdram_type(regs->sdram_config) == EMIF_SDRAM_TYPE_DDR3) {
 		writel(readl(&emif_reg[nr]->emif_ddr_ext_phy_ctrl_36) |
@ -138,6 +152,9 @@ void config_sdram_emif4d5(const struct emif_regs *regs, int nr)
 		/* Enable read leveling */
 		writel(0x80000000, &emif_reg[nr]->emif_rd_wr_lvl_ctl);

+		/* Wait 1ms because of L3 timeout error */
+		udelay(1000);
+
 		/*
 		 * Enable full read and write leveling.  Wait for read and write
 		 * leveling bit to clear RDWRLVLFULL_START bit 31
@ -256,8 +273,16 @@ static void ext_phy_settings_hwlvl(const struct emif_regs *regs, int nr)
 	 * Enable hardware leveling on the EMIF.  For details about these
 	 * magic values please see the EMIF registers section of the TRM.
 	 */
-	writel(0x08020080, &emif_reg[nr]->emif_ddr_ext_phy_ctrl_1);
-	writel(0x08020080, &emif_reg[nr]->emif_ddr_ext_phy_ctrl_1_shdw);
+	if (regs->emif_ddr_phy_ctlr_1 & 0x00040000) {
+		/* PHY_INVERT_CLKOUT = 1 */
+		writel(0x00040100, &emif_reg[nr]->emif_ddr_ext_phy_ctrl_1);
+		writel(0x00040100, &emif_reg[nr]->emif_ddr_ext_phy_ctrl_1_shdw);
+	} else {
+		/* PHY_INVERT_CLKOUT = 0 */
+		writel(0x08020080, &emif_reg[nr]->emif_ddr_ext_phy_ctrl_1);
+		writel(0x08020080, &emif_reg[nr]->emif_ddr_ext_phy_ctrl_1_shdw);
+	}
+
 	writel(0x00000000, &emif_reg[nr]->emif_ddr_ext_phy_ctrl_22);
 	writel(0x00000000, &emif_reg[nr]->emif_ddr_ext_phy_ctrl_22_shdw);
 	writel(0x00600020, &emif_reg[nr]->emif_ddr_ext_phy_ctrl_23);
@ -286,8 +311,8 @@ static void ext_phy_settings_hwlvl(const struct emif_regs *regs, int nr)
 	writel(0x00000000, &emif_reg[nr]->emif_ddr_ext_phy_ctrl_34_shdw);
 	writel(0x00000000, &emif_reg[nr]->emif_ddr_ext_phy_ctrl_35);
 	writel(0x00000000, &emif_reg[nr]->emif_ddr_ext_phy_ctrl_35_shdw);
-	writel(0x000000FF, &emif_reg[nr]->emif_ddr_ext_phy_ctrl_36);
-	writel(0x000000FF, &emif_reg[nr]->emif_ddr_ext_phy_ctrl_36_shdw);
+	writel(0x00000077, &emif_reg[nr]->emif_ddr_ext_phy_ctrl_36);
+	writel(0x00000077, &emif_reg[nr]->emif_ddr_ext_phy_ctrl_36_shdw);

 	/*
 	 * Sequence to ensure that the PHY is again in a known state after
--- a/arch/arm/mach-socfpga/misc.c
+++ b/arch/arm/mach-socfpga/misc.c
@ -59,20 +59,10 @@ void enable_caches(void)
 #ifdef CONFIG_SYS_L2_PL310
 void v7_outer_cache_enable(void)
 {
-	/* Disable the L2 cache */
-	clrbits_le32(&pl310->pl310_ctrl, L2X0_CTRL_EN);
+	struct udevice *dev;

-	writel(0x0, &pl310->pl310_tag_latency_ctrl);
-	writel(0x10, &pl310->pl310_data_latency_ctrl);
-
-	/* enable BRESP, instruction and data prefetch, full line of zeroes */
-	setbits_le32(&pl310->pl310_aux_ctrl,
-		     L310_AUX_CTRL_DATA_PREFETCH_MASK |
-		     L310_AUX_CTRL_INST_PREFETCH_MASK |
-		     L310_SHARED_ATT_OVERRIDE_ENABLE);
-
-	/* Enable the L2 cache */
-	setbits_le32(&pl310->pl310_ctrl, L2X0_CTRL_EN);
+	if (uclass_get_device(UCLASS_CACHE, 0, &dev))
+		pr_err("cache controller driver NOT found!\n");
 }

 void v7_outer_cache_disable(void)
--- a/arch/microblaze/cpu/u-boot-spl.lds
+++ b/arch/microblaze/cpu/u-boot-spl.lds
@ -57,6 +57,6 @@ SECTIONS
 }

 #if defined(CONFIG_SPL_MAX_FOOTPRINT)
-ASSERT(__end - _start < (CONFIG_SPL_MAX_FOOTPRINT), \
+ASSERT(__end - _start <= (CONFIG_SPL_MAX_FOOTPRINT), \
        "SPL image plus BSS too big");
 #endif
--- a/board/ti/am43xx/board.c
+++ b/board/ti/am43xx/board.c
@ -244,7 +244,7 @@ const struct emif_regs ddr3_emif_regs_400Mhz_production = {
 	.read_idle_ctrl			= 0x00050000,
 	.zq_config			= 0x50074BE4,
 	.temp_alert_config		= 0x0,
-	.emif_ddr_phy_ctlr_1		= 0x0E004008,
+	.emif_ddr_phy_ctlr_1		= 0x00048008,
 	.emif_ddr_ext_phy_ctrl_1	= 0x08020080,
 	.emif_ddr_ext_phy_ctrl_2	= 0x00000066,
 	.emif_ddr_ext_phy_ctrl_3	= 0x00000091,
--- a/board/ti/am65x/Kconfig
+++ b/board/ti/am65x/Kconfig
@ -11,6 +11,7 @@ config TARGET_AM654_A53_EVM
 	bool "TI K3 based AM654 EVM running on A53"
 	select ARM64
 	select SOC_K3_AM6
+	select SYS_DISABLE_DCACHE_OPS

 config TARGET_AM654_R5_EVM
 	bool "TI K3 based AM654 EVM running on R5"
--- a/board/toradex/colibri-imx6ull/MAINTAINERS
+++ b/board/toradex/colibri-imx6ull/MAINTAINERS
@ -1,6 +1,5 @@
 Colibri iMX6ULL
 M:	Stefan Agner <stefan.agner@toradex.com>
-M:	Toradex ARM Support <support.arm@toradex.com>
 W:	http://developer.toradex.com/software/linux/linux-software
 W:	https://www.toradex.com/community
 S:	Maintained
--- a/board/toradex/colibri_imx7/MAINTAINERS
+++ b/board/toradex/colibri_imx7/MAINTAINERS
@ -1,6 +1,5 @@
 Colibri iMX7
 M:	Stefan Agner <stefan.agner@toradex.com>
-M:	Toradex ARM Support <support.arm@toradex.com>
 W:	http://developer.toradex.com/software/linux/linux-software
 W:	https://www.toradex.com/community
 S:	Maintained
--- a/cmd/pxe.c
+++ b/cmd/pxe.c
@ -24,6 +24,9 @@

 const char *pxe_default_paths[] = {
 #ifdef CONFIG_SYS_SOC
+#ifdef CONFIG_SYS_BOARD
+	"default-" CONFIG_SYS_ARCH "-" CONFIG_SYS_SOC "-" CONFIG_SYS_BOARD,
+#endif
 	"default-" CONFIG_SYS_ARCH "-" CONFIG_SYS_SOC,
 #endif
 	"default-" CONFIG_SYS_ARCH,
--- a/common/spl/spl_fit.c
+++ b/common/spl/spl_fit.c
@ -333,7 +333,7 @@ static int spl_fit_record_loadable(const void *fit, int images, int index,

 static int spl_fit_image_get_os(const void *fit, int noffset, uint8_t *os)
 {
-#if CONFIG_IS_ENABLED(FIT_IMAGE_TINY)
+#if CONFIG_IS_ENABLED(FIT_IMAGE_TINY) && !defined(CONFIG_SPL_OS_BOOT)
 	return -ENOTSUPP;
 #else
 	return fit_image_get_os(fit, noffset, os);
--- a/configs/am335x_evm_defconfig
+++ b/configs/am335x_evm_defconfig
@ -11,6 +11,7 @@ CONFIG_LOGLEVEL=3
 CONFIG_SYS_CONSOLE_INFO_QUIET=y
 CONFIG_VERSION_VARIABLE=y
 CONFIG_ARCH_MISC_INIT=y
+CONFIG_SPL_FIT_IMAGE_TINY=y
 CONFIG_SPL_ETH_SUPPORT=y
 # CONFIG_SPL_FS_EXT4 is not set
 CONFIG_SPL_MTD_SUPPORT=y
--- a/configs/da850evm_defconfig
+++ b/configs/da850evm_defconfig
@ -68,5 +68,10 @@ CONFIG_SYS_NS16550=y
 CONFIG_SPI=y
 CONFIG_DM_SPI=y
 CONFIG_DAVINCI_SPI=y
+CONFIG_USB=y
+CONFIG_DM_USB=y
+# CONFIG_SPL_DM_USB is not set
+CONFIG_USB_OHCI_HCD=y
+CONFIG_USB_OHCI_DA8XX=y
 # CONFIG_FAT_WRITE is not set
 CONFIG_USE_TINY_PRINTF=y
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@ -14,6 +14,8 @@ source "drivers/block/Kconfig"

 source "drivers/bootcount/Kconfig"

+source "drivers/cache/Kconfig"
+
 source "drivers/clk/Kconfig"

 source "drivers/cpu/Kconfig"
--- a/drivers/Makefile
+++ b/drivers/Makefile
@ -77,6 +77,7 @@ obj-$(CONFIG_BIOSEMU) += bios_emulator/
 obj-y += block/
 obj-y += board/
 obj-$(CONFIG_BOOTCOUNT_LIMIT) += bootcount/
+obj-y += cache/
 obj-$(CONFIG_CPU) += cpu/
 obj-y += crypto/
 obj-$(CONFIG_FASTBOOT) += fastboot/
--- a/drivers/cache/Kconfig
+++ b/drivers/cache/Kconfig
@ -0,0 +1,25 @@
+#
+# Cache controllers
+#
+
+menu "Cache Controller drivers"
+
+config CACHE
+	bool "Enable Driver Model for Cache controllers"
+	depends on DM
+	help
+	  Enable driver model for cache controllers that are found on
+	  most CPU's. Cache is memory that the CPU can access directly and
+	  is usually located on the same chip. This uclass can be used for
+	  configuring settings that be found from a device tree file.
+
+config L2X0_CACHE
+	tristate "PL310 cache driver"
+	select CACHE
+	depends on ARM
+	help
+	  This driver is for the PL310 cache controller commonly found on
+	  ARMv7(32-bit) devices. The driver configures the cache settings
+	  found in the device tree.
+
+endmenu
--- a/drivers/cache/Makefile
+++ b/drivers/cache/Makefile
@ -0,0 +1,4 @@
+
+obj-$(CONFIG_CACHE) += cache-uclass.o
+obj-$(CONFIG_SANDBOX) += sandbox_cache.o
+obj-$(CONFIG_L2X0_CACHE) += cache-l2x0.o
--- a/drivers/cache/cache-l2x0.c
+++ b/drivers/cache/cache-l2x0.c
@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Intel Corporation <www.intel.com>
+ */
+#include <common.h>
+#include <command.h>
+#include <dm.h>
+
+#include <asm/io.h>
+#include <asm/pl310.h>
+
+static void l2c310_of_parse_and_init(struct udevice *dev)
+{
+	u32 tag[3] = { 0, 0, 0 };
+	u32 saved_reg, prefetch;
+	struct pl310_regs *regs = (struct pl310_regs *)dev_read_addr(dev);
+
+	/* Disable the L2 Cache */
+	clrbits_le32(&regs->pl310_ctrl, L2X0_CTRL_EN);
+
+	saved_reg = readl(&regs->pl310_aux_ctrl);
+	if (!dev_read_u32(dev, "prefetch-data", &prefetch)) {
+		if (prefetch)
+			saved_reg |= L310_AUX_CTRL_DATA_PREFETCH_MASK;
+		else
+			saved_reg &= ~L310_AUX_CTRL_DATA_PREFETCH_MASK;
+	}
+
+	if (!dev_read_u32(dev, "prefetch-instr", &prefetch)) {
+		if (prefetch)
+			saved_reg |= L310_AUX_CTRL_INST_PREFETCH_MASK;
+		else
+			saved_reg &= ~L310_AUX_CTRL_INST_PREFETCH_MASK;
+	}
+
+	saved_reg |= dev_read_bool(dev, "arm,shared-override");
+	writel(saved_reg, &regs->pl310_aux_ctrl);
+
+	saved_reg = readl(&regs->pl310_tag_latency_ctrl);
+	if (!dev_read_u32_array(dev, "arm,tag-latency", tag, 3))
+		saved_reg |= L310_LATENCY_CTRL_RD(tag[0] - 1) |
+			     L310_LATENCY_CTRL_WR(tag[1] - 1) |
+			     L310_LATENCY_CTRL_SETUP(tag[2] - 1);
+	writel(saved_reg, &regs->pl310_tag_latency_ctrl);
+
+	saved_reg = readl(&regs->pl310_data_latency_ctrl);
+	if (!dev_read_u32_array(dev, "arm,data-latency", tag, 3))
+		saved_reg |= L310_LATENCY_CTRL_RD(tag[0] - 1) |
+			     L310_LATENCY_CTRL_WR(tag[1] - 1) |
+			     L310_LATENCY_CTRL_SETUP(tag[2] - 1);
+	writel(saved_reg, &regs->pl310_data_latency_ctrl);
+
+	/* Enable the L2 cache */
+	setbits_le32(&regs->pl310_ctrl, L2X0_CTRL_EN);
+}
+
+static int l2x0_probe(struct udevice *dev)
+{
+	l2c310_of_parse_and_init(dev);
+
+	return 0;
+}
+
+
+static const struct udevice_id l2x0_ids[] = {
+	{ .compatible = "arm,pl310-cache" },
+	{}
+};
+
+U_BOOT_DRIVER(pl310_cache) = {
+	.name   = "pl310_cache",
+	.id     = UCLASS_CACHE,
+	.of_match = l2x0_ids,
+	.probe	= l2x0_probe,
+	.flags  = DM_FLAG_PRE_RELOC,
+};
--- a/drivers/cache/cache-uclass.c
+++ b/drivers/cache/cache-uclass.c
@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Intel Corporation <www.intel.com>
+ */
+
+#include <common.h>
+#include <cache.h>
+#include <dm.h>
+
+int cache_get_info(struct udevice *dev, struct cache_info *info)
+{
+	struct cache_ops *ops = cache_get_ops(dev);
+
+	if (!ops->get_info)
+		return -ENOSYS;
+
+	return ops->get_info(dev, info);
+}
+
+UCLASS_DRIVER(cache) = {
+	.id		= UCLASS_CACHE,
+	.name		= "cache",
+	.post_bind	= dm_scan_fdt_dev,
+};
--- a/drivers/cache/sandbox_cache.c
+++ b/drivers/cache/sandbox_cache.c
@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Intel Corporation <www.intel.com>
+ */
+
+#include <common.h>
+#include <cache.h>
+#include <dm.h>
+#include <errno.h>
+
+DECLARE_GLOBAL_DATA_PTR;
+
+static int sandbox_get_info(struct udevice *dev, struct cache_info *info)
+{
+	info->base = 0x11223344;
+
+	return 0;
+}
+
+static const struct cache_ops sandbox_cache_ops = {
+	.get_info	= sandbox_get_info,
+};
+
+static const struct udevice_id sandbox_cache_ids[] = {
+	{ .compatible = "sandbox,cache" },
+	{ }
+};
+
+U_BOOT_DRIVER(cache_sandbox) = {
+	.name		= "cache_sandbox",
+	.id		= UCLASS_CACHE,
+	.of_match	= sandbox_cache_ids,
+	.ops		= &sandbox_cache_ops,
+};
--- a/drivers/usb/host/Kconfig
+++ b/drivers/usb/host/Kconfig
@ -246,6 +246,11 @@ config USB_OHCI_GENERIC
 	---help---
 	  Enables support for generic OHCI controller.

+config USB_OHCI_DA8XX
+	bool "Support for da850 OHCI USB controller"
+	help
+	  Enable support for the da850 USB controller.
+
 endif # USB_OHCI_HCD

 config USB_UHCI_HCD
--- a/drivers/usb/host/ohci-da8xx.c
+++ b/drivers/usb/host/ohci-da8xx.c
@ -4,9 +4,54 @@
 */

 #include <common.h>
-
+#include <asm/io.h>
+#include <clk.h>
+#include <dm.h>
+#include <dm/ofnode.h>
+#include <generic-phy.h>
+#include <reset.h>
+#include "ohci.h"
 #include <asm/arch/da8xx-usb.h>

+struct da8xx_ohci {
+	ohci_t ohci;
+	struct clk *clocks;	/* clock list */
+	struct phy phy;
+	int clock_count;	/* number of clock in clock list */
+};
+
+static int usb_phy_on(void)
+{
+	unsigned long timeout;
+
+	clrsetbits_le32(&davinci_syscfg_regs->cfgchip2,
+			(CFGCHIP2_RESET | CFGCHIP2_PHYPWRDN |
+			CFGCHIP2_OTGPWRDN | CFGCHIP2_OTGMODE |
+			CFGCHIP2_REFFREQ | CFGCHIP2_USB1PHYCLKMUX),
+			(CFGCHIP2_SESENDEN | CFGCHIP2_VBDTCTEN |
+			CFGCHIP2_PHY_PLLON | CFGCHIP2_REFFREQ_24MHZ |
+			CFGCHIP2_USB2PHYCLKMUX | CFGCHIP2_USB1SUSPENDM));
+
+	/* wait until the usb phy pll locks */
+	timeout = get_timer(0);
+	while (get_timer(timeout) < 10) {
+		if (readl(&davinci_syscfg_regs->cfgchip2) & CFGCHIP2_PHYCLKGD)
+			return 1;
+	}
+
+	/* USB phy was not turned on */
+	return 0;
+}
+
+static void usb_phy_off(void)
+{
+	/* Power down the on-chip PHY. */
+	clrsetbits_le32(&davinci_syscfg_regs->cfgchip2,
+			CFGCHIP2_PHY_PLLON | CFGCHIP2_USB1SUSPENDM,
+			CFGCHIP2_PHYPWRDN | CFGCHIP2_OTGPWRDN |
+			CFGCHIP2_RESET);
+}
+
 int usb_cpu_init(void)
 {
 	/* enable psc for usb2.0 */
@ -37,3 +82,94 @@ int usb_cpu_init_fail(void)
 {
 	return usb_cpu_stop();
 }
+
+#if CONFIG_IS_ENABLED(DM_USB)
+static int ohci_da8xx_probe(struct udevice *dev)
+{
+	struct ohci_regs *regs = (struct ohci_regs *)devfdt_get_addr(dev);
+	struct da8xx_ohci *priv = dev_get_priv(dev);
+	int i, err, ret, clock_nb;
+
+	err = 0;
+	priv->clock_count = 0;
+	clock_nb = dev_count_phandle_with_args(dev, "clocks", "#clock-cells");
+	if (clock_nb > 0) {
+		priv->clocks = devm_kcalloc(dev, clock_nb, sizeof(struct clk),
+					    GFP_KERNEL);
+		if (!priv->clocks)
+			return -ENOMEM;
+
+		for (i = 0; i < clock_nb; i++) {
+			err = clk_get_by_index(dev, i, &priv->clocks[i]);
+			if (err < 0)
+				break;
+
+			err = clk_enable(&priv->clocks[i]);
+			if (err) {
+				dev_err(dev, "failed to enable clock %d\n", i);
+				clk_free(&priv->clocks[i]);
+				goto clk_err;
+			}
+			priv->clock_count++;
+		}
+	} else if (clock_nb != -ENOENT) {
+		dev_err(dev, "failed to get clock phandle(%d)\n", clock_nb);
+		return clock_nb;
+	}
+
+	err = usb_cpu_init();
+
+	if (err)
+		goto clk_err;
+
+	err = ohci_register(dev, regs);
+	if (err)
+		goto phy_err;
+
+	return 0;
+
+phy_err:
+	ret = usb_cpu_stop();
+	if (ret)
+		dev_err(dev, "failed to shutdown usb phy\n");
+
+clk_err:
+	ret = clk_release_all(priv->clocks, priv->clock_count);
+	if (ret)
+		dev_err(dev, "failed to disable all clocks\n");
+
+	return err;
+}
+
+static int ohci_da8xx_remove(struct udevice *dev)
+{
+	struct da8xx_ohci *priv = dev_get_priv(dev);
+	int ret;
+
+	ret = ohci_deregister(dev);
+	if (ret)
+		return ret;
+
+	ret = usb_cpu_stop();
+	if (ret)
+		return ret;
+
+	return clk_release_all(priv->clocks, priv->clock_count);
+}
+
+static const struct udevice_id da8xx_ohci_ids[] = {
+	{ .compatible = "ti,da830-ohci" },
+	{ }
+};
+
+U_BOOT_DRIVER(ohci_generic) = {
+	.name	= "ohci-da8xx",
+	.id	= UCLASS_USB,
+	.of_match = da8xx_ohci_ids,
+	.probe = ohci_da8xx_probe,
+	.remove = ohci_da8xx_remove,
+	.ops	= &ohci_usb_ops,
+	.priv_auto_alloc_size = sizeof(struct da8xx_ohci),
+	.flags	= DM_FLAG_ALLOC_PRIV_DMA,
+};
+#endif
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@ -2,6 +2,7 @@ config FS_BTRFS
 	bool "Enable BTRFS filesystem support"
 	select CRC32C
 	select LZO
+	select ZSTD
 	select RBTREE
 	help
 	  This provides a single-device read-only BTRFS support. BTRFS is a
--- a/fs/btrfs/btrfs_tree.h
+++ b/fs/btrfs/btrfs_tree.h
@ -647,8 +647,9 @@ enum btrfs_compression_type {
 	BTRFS_COMPRESS_NONE  = 0,
 	BTRFS_COMPRESS_ZLIB  = 1,
 	BTRFS_COMPRESS_LZO   = 2,
-	BTRFS_COMPRESS_TYPES = 2,
-	BTRFS_COMPRESS_LAST  = 3,
+	BTRFS_COMPRESS_ZSTD  = 3,
+	BTRFS_COMPRESS_TYPES = 3,
+	BTRFS_COMPRESS_LAST  = 4,
 };

 struct btrfs_file_extent_item {
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@ -6,7 +6,9 @@
 */

 #include "btrfs.h"
+#include <malloc.h>
 #include <linux/lzo.h>
+#include <linux/zstd.h>
 #include <u-boot/zlib.h>
 #include <asm/unaligned.h>

@ -108,6 +110,61 @@ static u32 decompress_zlib(const u8 *_cbuf, u32 clen, u8 *dbuf, u32 dlen)
 	return res;
 }

+#define ZSTD_BTRFS_MAX_WINDOWLOG 17
+#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG)
+
+static u32 decompress_zstd(const u8 *cbuf, u32 clen, u8 *dbuf, u32 dlen)
+{
+	ZSTD_DStream *dstream;
+	ZSTD_inBuffer in_buf;
+	ZSTD_outBuffer out_buf;
+	void *workspace;
+	size_t wsize;
+	u32 res = -1;
+
+	wsize = ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT);
+	workspace = malloc(wsize);
+	if (!workspace) {
+		debug("%s: cannot allocate workspace of size %zu\n", __func__,
+		      wsize);
+		return -1;
+	}
+
+	dstream = ZSTD_initDStream(ZSTD_BTRFS_MAX_INPUT, workspace, wsize);
+	if (!dstream) {
+		printf("%s: ZSTD_initDStream failed\n", __func__);
+		goto err_free;
+	}
+
+	in_buf.src = cbuf;
+	in_buf.pos = 0;
+	in_buf.size = clen;
+
+	out_buf.dst = dbuf;
+	out_buf.pos = 0;
+	out_buf.size = dlen;
+
+	while (1) {
+		size_t ret;
+
+		ret = ZSTD_decompressStream(dstream, &out_buf, &in_buf);
+		if (ZSTD_isError(ret)) {
+			printf("%s: ZSTD_decompressStream error %d\n", __func__,
+			       ZSTD_getErrorCode(ret));
+			goto err_free;
+		}
+
+		if (in_buf.pos >= clen || !ret)
+			break;
+	}
+
+	res = out_buf.pos;
+
+err_free:
+	free(workspace);
+	return res;
+}
+
 u32 btrfs_decompress(u8 type, const char *c, u32 clen, char *d, u32 dlen)
 {
 	u32 res;
@ -126,6 +183,8 @@ u32 btrfs_decompress(u8 type, const char *c, u32 clen, char *d, u32 dlen)
 		return decompress_zlib(cbuf, clen, dbuf, dlen);
 	case BTRFS_COMPRESS_LZO:
 		return decompress_lzo(cbuf, clen, dbuf, dlen);
+	case BTRFS_COMPRESS_ZSTD:
+		return decompress_zstd(cbuf, clen, dbuf, dlen);
 	default:
 		printf("%s: Unsupported compression in extent: %i\n", __func__,
 		       type);
--- a/include/cache.h
+++ b/include/cache.h
@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Intel Corporation <www.intel.com>
+ */
+
+#ifndef __CACHE_H
+#define __CACHE_H
+
+/*
+ * Structure for the cache controller
+ */
+struct cache_info {
+	phys_addr_t base; /* Base physical address of cache device. */
+};
+
+struct cache_ops {
+	/**
+	 * get_info() - Get basic cache info
+	 *
+	 * @dev:	Device to check (UCLASS_CACHE)
+	 * @info:	Place to put info
+	 * @return 0 if OK, -ve on error
+	 */
+	int (*get_info)(struct udevice *dev, struct cache_info *info);
+};
+
+#define cache_get_ops(dev)	((struct cache_ops *)(dev)->driver->ops)
+
+/**
+ * cache_get_info() - Get information about a cache controller
+ *
+ * @dev:	Device to check (UCLASS_CACHE)
+ * @info:	Returns cache info
+ * @return 0 if OK, -ve on error
+ */
+int cache_get_info(struct udevice *dev, struct cache_info *info);
+
+#endif
--- a/include/configs/da850evm.h
+++ b/include/configs/da850evm.h
@ -267,6 +267,14 @@
 #define CONFIG_ENV_SIZE		(16 << 10)
 #endif

+/* USB Configs */
+#define CONFIG_SYS_USB_OHCI_CPU_INIT
+#define CONFIG_USB_OHCI_NEW
+#define CONFIG_USB_STORAGE
+#define CONFIG_SYS_USB_OHCI_REGS_BASE		0x01E25000
+#define CONFIG_SYS_USB_OHCI_MAX_ROOT_PORTS	15
+#define CONFIG_SYS_USB_OHCI_SLOT_NAME		"da850evm"
+
 #ifndef CONFIG_DIRECT_NOR_BOOT
 /* defines for SPL */
 #define CONFIG_SYS_SPL_MALLOC_START	(CONFIG_SYS_TEXT_BASE - \
--- a/include/dm/uclass-id.h
+++ b/include/dm/uclass-id.h
@ -34,6 +34,7 @@ enum uclass_id {
 	UCLASS_BLK,		/* Block device */
 	UCLASS_BOARD,		/* Device information from hardware */
 	UCLASS_BOOTCOUNT,       /* Bootcount backing store */
+	UCLASS_CACHE,		/* Cache controller */
 	UCLASS_CLK,		/* Clock source, e.g. used by peripherals */
 	UCLASS_CPU,		/* CPU, typically part of an SoC */
 	UCLASS_CROS_EC,		/* Chrome OS EC */
--- a/include/image.h
+++ b/include/image.h
@ -1072,18 +1072,18 @@ int calculate_hash(const void *data, int data_len, const char *algo,
 * At present we only support signing on the host, and verification on the
 * device
 */
-#if defined(CONFIG_FIT_SIGNATURE)
-# ifdef USE_HOSTCC
+#if defined(USE_HOSTCC)
+# if defined(CONFIG_FIT_SIGNATURE)
 #  define IMAGE_ENABLE_SIGN	1
 #  define IMAGE_ENABLE_VERIFY	1
-# include  <openssl/evp.h>
-#else
+#  include <openssl/evp.h>
+# else
 #  define IMAGE_ENABLE_SIGN	0
-#  define IMAGE_ENABLE_VERIFY	1
+#  define IMAGE_ENABLE_VERIFY	0
 # endif
 #else
 # define IMAGE_ENABLE_SIGN	0
-# define IMAGE_ENABLE_VERIFY	0
+# define IMAGE_ENABLE_VERIFY	CONFIG_IS_ENABLED(FIT_SIGNATURE)
 #endif

 #ifdef USE_HOSTCC
--- a/include/linux/xxhash.h
+++ b/include/linux/xxhash.h
@ -0,0 +1,229 @@
+/* SPDX-License-Identifier: (GPL-2.0 or BSD-2-Clause) */
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Copyright (C) 2012-2016, Yann Collet.
+ *
+ * You can contact the author at:
+ * - xxHash homepage: http://cyan4973.github.io/xxHash/
+ * - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+/*
+ * Notice extracted from xxHash homepage:
+ *
+ * xxHash is an extremely fast Hash algorithm, running at RAM speed limits.
+ * It also successfully passes all tests from the SMHasher suite.
+ *
+ * Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2
+ * Duo @3GHz)
+ *
+ * Name            Speed       Q.Score   Author
+ * xxHash          5.4 GB/s     10
+ * CrapWow         3.2 GB/s      2       Andrew
+ * MumurHash 3a    2.7 GB/s     10       Austin Appleby
+ * SpookyHash      2.0 GB/s     10       Bob Jenkins
+ * SBox            1.4 GB/s      9       Bret Mulvey
+ * Lookup3         1.2 GB/s      9       Bob Jenkins
+ * SuperFastHash   1.2 GB/s      1       Paul Hsieh
+ * CityHash64      1.05 GB/s    10       Pike & Alakuijala
+ * FNV             0.55 GB/s     5       Fowler, Noll, Vo
+ * CRC32           0.43 GB/s     9
+ * MD5-32          0.33 GB/s    10       Ronald L. Rivest
+ * SHA1-32         0.28 GB/s    10
+ *
+ * Q.Score is a measure of quality of the hash function.
+ * It depends on successfully passing SMHasher test set.
+ * 10 is a perfect score.
+ *
+ * A 64-bits version, named xxh64 offers much better speed,
+ * but for 64-bits applications only.
+ * Name     Speed on 64 bits    Speed on 32 bits
+ * xxh64       13.8 GB/s            1.9 GB/s
+ * xxh32        6.8 GB/s            6.0 GB/s
+ */
+
+#ifndef XXHASH_H
+#define XXHASH_H
+
+#include <linux/types.h>
+
+/*-****************************
+ * Simple Hash Functions
+ *****************************/
+
+/**
+ * xxh32() - calculate the 32-bit hash of the input with a given seed.
+ *
+ * @input:  The data to hash.
+ * @length: The length of the data to hash.
+ * @seed:   The seed can be used to alter the result predictably.
+ *
+ * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
+ *
+ * Return:  The 32-bit hash of the data.
+ */
+uint32_t xxh32(const void *input, size_t length, uint32_t seed);
+
+/**
+ * xxh64() - calculate the 64-bit hash of the input with a given seed.
+ *
+ * @input:  The data to hash.
+ * @length: The length of the data to hash.
+ * @seed:   The seed can be used to alter the result predictably.
+ *
+ * This function runs 2x faster on 64-bit systems, but slower on 32-bit systems.
+ *
+ * Return:  The 64-bit hash of the data.
+ */
+uint64_t xxh64(const void *input, size_t length, uint64_t seed);
+
+/**
+ * xxhash() - calculate wordsize hash of the input with a given seed
+ * @input:  The data to hash.
+ * @length: The length of the data to hash.
+ * @seed:   The seed can be used to alter the result predictably.
+ *
+ * If the hash does not need to be comparable between machines with
+ * different word sizes, this function will call whichever of xxh32()
+ * or xxh64() is faster.
+ *
+ * Return:  wordsize hash of the data.
+ */
+
+static inline unsigned long xxhash(const void *input, size_t length,
+				   uint64_t seed)
+{
+#if BITS_PER_LONG == 64
+       return xxh64(input, length, seed);
+#else
+       return xxh32(input, length, seed);
+#endif
+}
+
+/*-****************************
+ * Streaming Hash Functions
+ *****************************/
+
+/*
+ * These definitions are only meant to allow allocation of XXH state
+ * statically, on stack, or in a struct for example.
+ * Do not use members directly.
+ */
+
+/**
+ * struct xxh32_state - private xxh32 state, do not use members directly
+ */
+struct xxh32_state {
+	uint32_t total_len_32;
+	uint32_t large_len;
+	uint32_t v1;
+	uint32_t v2;
+	uint32_t v3;
+	uint32_t v4;
+	uint32_t mem32[4];
+	uint32_t memsize;
+};
+
+/**
+ * struct xxh32_state - private xxh64 state, do not use members directly
+ */
+struct xxh64_state {
+	uint64_t total_len;
+	uint64_t v1;
+	uint64_t v2;
+	uint64_t v3;
+	uint64_t v4;
+	uint64_t mem64[4];
+	uint32_t memsize;
+};
+
+/**
+ * xxh32_reset() - reset the xxh32 state to start a new hashing operation
+ *
+ * @state: The xxh32 state to reset.
+ * @seed:  Initialize the hash state with this seed.
+ *
+ * Call this function on any xxh32_state to prepare for a new hashing operation.
+ */
+void xxh32_reset(struct xxh32_state *state, uint32_t seed);
+
+/**
+ * xxh32_update() - hash the data given and update the xxh32 state
+ *
+ * @state:  The xxh32 state to update.
+ * @input:  The data to hash.
+ * @length: The length of the data to hash.
+ *
+ * After calling xxh32_reset() call xxh32_update() as many times as necessary.
+ *
+ * Return:  Zero on success, otherwise an error code.
+ */
+int xxh32_update(struct xxh32_state *state, const void *input, size_t length);
+
+/**
+ * xxh32_digest() - produce the current xxh32 hash
+ *
+ * @state: Produce the current xxh32 hash of this state.
+ *
+ * A hash value can be produced at any time. It is still possible to continue
+ * inserting input into the hash state after a call to xxh32_digest(), and
+ * generate new hashes later on, by calling xxh32_digest() again.
+ *
+ * Return: The xxh32 hash stored in the state.
+ */
+uint32_t xxh32_digest(const struct xxh32_state *state);
+
+/**
+ * xxh64_reset() - reset the xxh64 state to start a new hashing operation
+ *
+ * @state: The xxh64 state to reset.
+ * @seed:  Initialize the hash state with this seed.
+ */
+void xxh64_reset(struct xxh64_state *state, uint64_t seed);
+
+/**
+ * xxh64_update() - hash the data given and update the xxh64 state
+ * @state:  The xxh64 state to update.
+ * @input:  The data to hash.
+ * @length: The length of the data to hash.
+ *
+ * After calling xxh64_reset() call xxh64_update() as many times as necessary.
+ *
+ * Return:  Zero on success, otherwise an error code.
+ */
+int xxh64_update(struct xxh64_state *state, const void *input, size_t length);
+
+/**
+ * xxh64_digest() - produce the current xxh64 hash
+ *
+ * @state: Produce the current xxh64 hash of this state.
+ *
+ * A hash value can be produced at any time. It is still possible to continue
+ * inserting input into the hash state after a call to xxh64_digest(), and
+ * generate new hashes later on, by calling xxh64_digest() again.
+ *
+ * Return: The xxh64 hash stored in the state.
+ */
+uint64_t xxh64_digest(const struct xxh64_state *state);
+
+/*-**************************
+ * Utils
+ ***************************/
+
+/**
+ * xxh32_copy_state() - copy the source state into the destination state
+ *
+ * @src: The source xxh32 state.
+ * @dst: The destination xxh32 state.
+ */
+void xxh32_copy_state(struct xxh32_state *dst, const struct xxh32_state *src);
+
+/**
+ * xxh64_copy_state() - copy the source state into the destination state
+ *
+ * @src: The source xxh64 state.
+ * @dst: The destination xxh64 state.
+ */
+void xxh64_copy_state(struct xxh64_state *dst, const struct xxh64_state *src);
+
+#endif /* XXHASH_H */
--- a/include/linux/zstd.h
+++ b/include/linux/zstd.h
--- a/lib/Kconfig
+++ b/lib/Kconfig
@ -327,6 +327,9 @@ config MD5
 config CRC32C
 	bool

+config XXHASH
+	bool
+
 endmenu

 menu "Compression Support"
@ -371,6 +374,12 @@ config ZLIB
 	help
 	  This enables ZLIB compression lib.

+config ZSTD
+	bool "Enable Zstandard decompression support"
+	select XXHASH
+	help
+	  This enables Zstandard decompression library.
+
 config SPL_LZ4
 	bool "Enable LZ4 decompression support in SPL"
 	help
@ -395,6 +404,12 @@ config SPL_ZLIB
 	help
 	  This enables compression lib for SPL boot.

+config SPL_ZSTD
+	bool "Enable Zstandard decompression support in SPL"
+	select XXHASH
+	help
+	  This enables Zstandard decompression library in the SPL.
+
 endmenu

 config ERRNO_STR
--- a/lib/Makefile
+++ b/lib/Makefile
@ -37,6 +37,7 @@ obj-$(CONFIG_GENERATE_SMBIOS_TABLE) += smbios.o
 obj-$(CONFIG_IMAGE_SPARSE) += image-sparse.o
 obj-y += ldiv.o
 obj-$(CONFIG_MD5) += md5.o
+obj-$(CONFIG_XXHASH) += xxhash.o
 obj-y += net_utils.o
 obj-$(CONFIG_PHYSMEM) += physmem.o
 obj-y += rc4.o
@ -58,6 +59,7 @@ obj-$(CONFIG_SHA1) += sha1.o
 obj-$(CONFIG_SHA256) += sha256.o

 obj-$(CONFIG_$(SPL_)ZLIB) += zlib/
+obj-$(CONFIG_$(SPL_)ZSTD) += zstd/
 obj-$(CONFIG_$(SPL_)GZIP) += gunzip.o
 obj-$(CONFIG_$(SPL_)LZO) += lzo/
 obj-$(CONFIG_$(SPL_)LZ4) += lz4_wrapper.o
--- a/lib/display_options.c
+++ b/lib/display_options.c
@ -23,7 +23,9 @@ char *display_options_get_banner_priv(bool newlines, const char *build_tag,
 				build_tag);
 	if (len > size - 3)
 		len = size - 3;
-	strcpy(buf + len, "\n\n");
+	if (len < 0)
+		len = 0;
+	snprintf(buf + len, size - len, "\n\n");

 	return buf;
 }
--- a/lib/xxhash.c
+++ b/lib/xxhash.c
@ -0,0 +1,467 @@
+// SPDX-License-Identifier: (GPL-2.0 or BSD-2-Clause)
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Copyright (C) 2012-2016, Yann Collet.
+ *
+ * You can contact the author at:
+ * - xxHash homepage: http://cyan4973.github.io/xxHash/
+ * - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+#include <asm/unaligned.h>
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/compat.h>
+#include <linux/string.h>
+#include <linux/xxhash.h>
+
+/*-*************************************
+ * Macros
+ **************************************/
+#define xxh_rotl32(x, r) ((x << r) | (x >> (32 - r)))
+#define xxh_rotl64(x, r) ((x << r) | (x >> (64 - r)))
+
+#ifdef __LITTLE_ENDIAN
+# define XXH_CPU_LITTLE_ENDIAN 1
+#else
+# define XXH_CPU_LITTLE_ENDIAN 0
+#endif
+
+/*-*************************************
+ * Constants
+ **************************************/
+static const uint32_t PRIME32_1 = 2654435761U;
+static const uint32_t PRIME32_2 = 2246822519U;
+static const uint32_t PRIME32_3 = 3266489917U;
+static const uint32_t PRIME32_4 =  668265263U;
+static const uint32_t PRIME32_5 =  374761393U;
+
+static const uint64_t PRIME64_1 = 11400714785074694791ULL;
+static const uint64_t PRIME64_2 = 14029467366897019727ULL;
+static const uint64_t PRIME64_3 =  1609587929392839161ULL;
+static const uint64_t PRIME64_4 =  9650029242287828579ULL;
+static const uint64_t PRIME64_5 =  2870177450012600261ULL;
+
+/*-**************************
+ *  Utils
+ ***************************/
+void xxh32_copy_state(struct xxh32_state *dst, const struct xxh32_state *src)
+{
+	memcpy(dst, src, sizeof(*dst));
+}
+EXPORT_SYMBOL(xxh32_copy_state);
+
+void xxh64_copy_state(struct xxh64_state *dst, const struct xxh64_state *src)
+{
+	memcpy(dst, src, sizeof(*dst));
+}
+EXPORT_SYMBOL(xxh64_copy_state);
+
+/*-***************************
+ * Simple Hash Functions
+ ****************************/
+static uint32_t xxh32_round(uint32_t seed, const uint32_t input)
+{
+	seed += input * PRIME32_2;
+	seed = xxh_rotl32(seed, 13);
+	seed *= PRIME32_1;
+	return seed;
+}
+
+uint32_t xxh32(const void *input, const size_t len, const uint32_t seed)
+{
+	const uint8_t *p = (const uint8_t *)input;
+	const uint8_t *b_end = p + len;
+	uint32_t h32;
+
+	if (len >= 16) {
+		const uint8_t *const limit = b_end - 16;
+		uint32_t v1 = seed + PRIME32_1 + PRIME32_2;
+		uint32_t v2 = seed + PRIME32_2;
+		uint32_t v3 = seed + 0;
+		uint32_t v4 = seed - PRIME32_1;
+
+		do {
+			v1 = xxh32_round(v1, get_unaligned_le32(p));
+			p += 4;
+			v2 = xxh32_round(v2, get_unaligned_le32(p));
+			p += 4;
+			v3 = xxh32_round(v3, get_unaligned_le32(p));
+			p += 4;
+			v4 = xxh32_round(v4, get_unaligned_le32(p));
+			p += 4;
+		} while (p <= limit);
+
+		h32 = xxh_rotl32(v1, 1) + xxh_rotl32(v2, 7) +
+			xxh_rotl32(v3, 12) + xxh_rotl32(v4, 18);
+	} else {
+		h32 = seed + PRIME32_5;
+	}
+
+	h32 += (uint32_t)len;
+
+	while (p + 4 <= b_end) {
+		h32 += get_unaligned_le32(p) * PRIME32_3;
+		h32 = xxh_rotl32(h32, 17) * PRIME32_4;
+		p += 4;
+	}
+
+	while (p < b_end) {
+		h32 += (*p) * PRIME32_5;
+		h32 = xxh_rotl32(h32, 11) * PRIME32_1;
+		p++;
+	}
+
+	h32 ^= h32 >> 15;
+	h32 *= PRIME32_2;
+	h32 ^= h32 >> 13;
+	h32 *= PRIME32_3;
+	h32 ^= h32 >> 16;
+
+	return h32;
+}
+EXPORT_SYMBOL(xxh32);
+
+static uint64_t xxh64_round(uint64_t acc, const uint64_t input)
+{
+	acc += input * PRIME64_2;
+	acc = xxh_rotl64(acc, 31);
+	acc *= PRIME64_1;
+	return acc;
+}
+
+static uint64_t xxh64_merge_round(uint64_t acc, uint64_t val)
+{
+	val = xxh64_round(0, val);
+	acc ^= val;
+	acc = acc * PRIME64_1 + PRIME64_4;
+	return acc;
+}
+
+uint64_t xxh64(const void *input, const size_t len, const uint64_t seed)
+{
+	const uint8_t *p = (const uint8_t *)input;
+	const uint8_t *const b_end = p + len;
+	uint64_t h64;
+
+	if (len >= 32) {
+		const uint8_t *const limit = b_end - 32;
+		uint64_t v1 = seed + PRIME64_1 + PRIME64_2;
+		uint64_t v2 = seed + PRIME64_2;
+		uint64_t v3 = seed + 0;
+		uint64_t v4 = seed - PRIME64_1;
+
+		do {
+			v1 = xxh64_round(v1, get_unaligned_le64(p));
+			p += 8;
+			v2 = xxh64_round(v2, get_unaligned_le64(p));
+			p += 8;
+			v3 = xxh64_round(v3, get_unaligned_le64(p));
+			p += 8;
+			v4 = xxh64_round(v4, get_unaligned_le64(p));
+			p += 8;
+		} while (p <= limit);
+
+		h64 = xxh_rotl64(v1, 1) + xxh_rotl64(v2, 7) +
+			xxh_rotl64(v3, 12) + xxh_rotl64(v4, 18);
+		h64 = xxh64_merge_round(h64, v1);
+		h64 = xxh64_merge_round(h64, v2);
+		h64 = xxh64_merge_round(h64, v3);
+		h64 = xxh64_merge_round(h64, v4);
+
+	} else {
+		h64  = seed + PRIME64_5;
+	}
+
+	h64 += (uint64_t)len;
+
+	while (p + 8 <= b_end) {
+		const uint64_t k1 = xxh64_round(0, get_unaligned_le64(p));
+
+		h64 ^= k1;
+		h64 = xxh_rotl64(h64, 27) * PRIME64_1 + PRIME64_4;
+		p += 8;
+	}
+
+	if (p + 4 <= b_end) {
+		h64 ^= (uint64_t)(get_unaligned_le32(p)) * PRIME64_1;
+		h64 = xxh_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+		p += 4;
+	}
+
+	while (p < b_end) {
+		h64 ^= (*p) * PRIME64_5;
+		h64 = xxh_rotl64(h64, 11) * PRIME64_1;
+		p++;
+	}
+
+	h64 ^= h64 >> 33;
+	h64 *= PRIME64_2;
+	h64 ^= h64 >> 29;
+	h64 *= PRIME64_3;
+	h64 ^= h64 >> 32;
+
+	return h64;
+}
+EXPORT_SYMBOL(xxh64);
+
+/*-**************************************************
+ * Advanced Hash Functions
+ ***************************************************/
+void xxh32_reset(struct xxh32_state *statePtr, const uint32_t seed)
+{
+	/* use a local state for memcpy() to avoid strict-aliasing warnings */
+	struct xxh32_state state;
+
+	memset(&state, 0, sizeof(state));
+	state.v1 = seed + PRIME32_1 + PRIME32_2;
+	state.v2 = seed + PRIME32_2;
+	state.v3 = seed + 0;
+	state.v4 = seed - PRIME32_1;
+	memcpy(statePtr, &state, sizeof(state));
+}
+EXPORT_SYMBOL(xxh32_reset);
+
+void xxh64_reset(struct xxh64_state *statePtr, const uint64_t seed)
+{
+	/* use a local state for memcpy() to avoid strict-aliasing warnings */
+	struct xxh64_state state;
+
+	memset(&state, 0, sizeof(state));
+	state.v1 = seed + PRIME64_1 + PRIME64_2;
+	state.v2 = seed + PRIME64_2;
+	state.v3 = seed + 0;
+	state.v4 = seed - PRIME64_1;
+	memcpy(statePtr, &state, sizeof(state));
+}
+EXPORT_SYMBOL(xxh64_reset);
+
+int xxh32_update(struct xxh32_state *state, const void *input, const size_t len)
+{
+	const uint8_t *p = (const uint8_t *)input;
+	const uint8_t *const b_end = p + len;
+
+	if (input == NULL)
+		return -EINVAL;
+
+	state->total_len_32 += (uint32_t)len;
+	state->large_len |= (len >= 16) | (state->total_len_32 >= 16);
+
+	if (state->memsize + len < 16) { /* fill in tmp buffer */
+		memcpy((uint8_t *)(state->mem32) + state->memsize, input, len);
+		state->memsize += (uint32_t)len;
+		return 0;
+	}
+
+	if (state->memsize) { /* some data left from previous update */
+		const uint32_t *p32 = state->mem32;
+
+		memcpy((uint8_t *)(state->mem32) + state->memsize, input,
+			16 - state->memsize);
+
+		state->v1 = xxh32_round(state->v1, get_unaligned_le32(p32));
+		p32++;
+		state->v2 = xxh32_round(state->v2, get_unaligned_le32(p32));
+		p32++;
+		state->v3 = xxh32_round(state->v3, get_unaligned_le32(p32));
+		p32++;
+		state->v4 = xxh32_round(state->v4, get_unaligned_le32(p32));
+		p32++;
+
+		p += 16-state->memsize;
+		state->memsize = 0;
+	}
+
+	if (p <= b_end - 16) {
+		const uint8_t *const limit = b_end - 16;
+		uint32_t v1 = state->v1;
+		uint32_t v2 = state->v2;
+		uint32_t v3 = state->v3;
+		uint32_t v4 = state->v4;
+
+		do {
+			v1 = xxh32_round(v1, get_unaligned_le32(p));
+			p += 4;
+			v2 = xxh32_round(v2, get_unaligned_le32(p));
+			p += 4;
+			v3 = xxh32_round(v3, get_unaligned_le32(p));
+			p += 4;
+			v4 = xxh32_round(v4, get_unaligned_le32(p));
+			p += 4;
+		} while (p <= limit);
+
+		state->v1 = v1;
+		state->v2 = v2;
+		state->v3 = v3;
+		state->v4 = v4;
+	}
+
+	if (p < b_end) {
+		memcpy(state->mem32, p, (size_t)(b_end-p));
+		state->memsize = (uint32_t)(b_end-p);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(xxh32_update);
+
+uint32_t xxh32_digest(const struct xxh32_state *state)
+{
+	const uint8_t *p = (const uint8_t *)state->mem32;
+	const uint8_t *const b_end = (const uint8_t *)(state->mem32) +
+		state->memsize;
+	uint32_t h32;
+
+	if (state->large_len) {
+		h32 = xxh_rotl32(state->v1, 1) + xxh_rotl32(state->v2, 7) +
+			xxh_rotl32(state->v3, 12) + xxh_rotl32(state->v4, 18);
+	} else {
+		h32 = state->v3 /* == seed */ + PRIME32_5;
+	}
+
+	h32 += state->total_len_32;
+
+	while (p + 4 <= b_end) {
+		h32 += get_unaligned_le32(p) * PRIME32_3;
+		h32 = xxh_rotl32(h32, 17) * PRIME32_4;
+		p += 4;
+	}
+
+	while (p < b_end) {
+		h32 += (*p) * PRIME32_5;
+		h32 = xxh_rotl32(h32, 11) * PRIME32_1;
+		p++;
+	}
+
+	h32 ^= h32 >> 15;
+	h32 *= PRIME32_2;
+	h32 ^= h32 >> 13;
+	h32 *= PRIME32_3;
+	h32 ^= h32 >> 16;
+
+	return h32;
+}
+EXPORT_SYMBOL(xxh32_digest);
+
+int xxh64_update(struct xxh64_state *state, const void *input, const size_t len)
+{
+	const uint8_t *p = (const uint8_t *)input;
+	const uint8_t *const b_end = p + len;
+
+	if (input == NULL)
+		return -EINVAL;
+
+	state->total_len += len;
+
+	if (state->memsize + len < 32) { /* fill in tmp buffer */
+		memcpy(((uint8_t *)state->mem64) + state->memsize, input, len);
+		state->memsize += (uint32_t)len;
+		return 0;
+	}
+
+	if (state->memsize) { /* tmp buffer is full */
+		uint64_t *p64 = state->mem64;
+
+		memcpy(((uint8_t *)p64) + state->memsize, input,
+			32 - state->memsize);
+
+		state->v1 = xxh64_round(state->v1, get_unaligned_le64(p64));
+		p64++;
+		state->v2 = xxh64_round(state->v2, get_unaligned_le64(p64));
+		p64++;
+		state->v3 = xxh64_round(state->v3, get_unaligned_le64(p64));
+		p64++;
+		state->v4 = xxh64_round(state->v4, get_unaligned_le64(p64));
+
+		p += 32 - state->memsize;
+		state->memsize = 0;
+	}
+
+	if (p + 32 <= b_end) {
+		const uint8_t *const limit = b_end - 32;
+		uint64_t v1 = state->v1;
+		uint64_t v2 = state->v2;
+		uint64_t v3 = state->v3;
+		uint64_t v4 = state->v4;
+
+		do {
+			v1 = xxh64_round(v1, get_unaligned_le64(p));
+			p += 8;
+			v2 = xxh64_round(v2, get_unaligned_le64(p));
+			p += 8;
+			v3 = xxh64_round(v3, get_unaligned_le64(p));
+			p += 8;
+			v4 = xxh64_round(v4, get_unaligned_le64(p));
+			p += 8;
+		} while (p <= limit);
+
+		state->v1 = v1;
+		state->v2 = v2;
+		state->v3 = v3;
+		state->v4 = v4;
+	}
+
+	if (p < b_end) {
+		memcpy(state->mem64, p, (size_t)(b_end-p));
+		state->memsize = (uint32_t)(b_end - p);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(xxh64_update);
+
+uint64_t xxh64_digest(const struct xxh64_state *state)
+{
+	const uint8_t *p = (const uint8_t *)state->mem64;
+	const uint8_t *const b_end = (const uint8_t *)state->mem64 +
+		state->memsize;
+	uint64_t h64;
+
+	if (state->total_len >= 32) {
+		const uint64_t v1 = state->v1;
+		const uint64_t v2 = state->v2;
+		const uint64_t v3 = state->v3;
+		const uint64_t v4 = state->v4;
+
+		h64 = xxh_rotl64(v1, 1) + xxh_rotl64(v2, 7) +
+			xxh_rotl64(v3, 12) + xxh_rotl64(v4, 18);
+		h64 = xxh64_merge_round(h64, v1);
+		h64 = xxh64_merge_round(h64, v2);
+		h64 = xxh64_merge_round(h64, v3);
+		h64 = xxh64_merge_round(h64, v4);
+	} else {
+		h64  = state->v3 + PRIME64_5;
+	}
+
+	h64 += (uint64_t)state->total_len;
+
+	while (p + 8 <= b_end) {
+		const uint64_t k1 = xxh64_round(0, get_unaligned_le64(p));
+
+		h64 ^= k1;
+		h64 = xxh_rotl64(h64, 27) * PRIME64_1 + PRIME64_4;
+		p += 8;
+	}
+
+	if (p + 4 <= b_end) {
+		h64 ^= (uint64_t)(get_unaligned_le32(p)) * PRIME64_1;
+		h64 = xxh_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+		p += 4;
+	}
+
+	while (p < b_end) {
+		h64 ^= (*p) * PRIME64_5;
+		h64 = xxh_rotl64(h64, 11) * PRIME64_1;
+		p++;
+	}
+
+	h64 ^= h64 >> 33;
+	h64 *= PRIME64_2;
+	h64 ^= h64 >> 29;
+	h64 *= PRIME64_3;
+	h64 ^= h64 >> 32;
+
+	return h64;
+}
+EXPORT_SYMBOL(xxh64_digest);
--- a/lib/zstd/Makefile
+++ b/lib/zstd/Makefile
@ -0,0 +1,4 @@
+obj-y += zstd_decompress.o
+
+zstd_decompress-y := huf_decompress.o decompress.o \
+		     entropy_common.o fse_decompress.o zstd_common.o
--- a/lib/zstd/bitstream.h
+++ b/lib/zstd/bitstream.h
@ -0,0 +1,344 @@
+/* SPDX-License-Identifier: (GPL-2.0 or BSD-2-Clause) */
+/*
+ * bitstream
+ * Part of FSE library
+ * header file (to include)
+ * Copyright (C) 2013-2016, Yann Collet.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ */
+#ifndef BITSTREAM_H_MODULE
+#define BITSTREAM_H_MODULE
+
+/*
+*  This API consists of small unitary functions, which must be inlined for best performance.
+*  Since link-time-optimization is not available for all compilers,
+*  these functions are defined into a .h to be included.
+*/
+
+/*-****************************************
+*  Dependencies
+******************************************/
+#include "error_private.h" /* error codes and messages */
+#include "mem.h"	   /* unaligned access routines */
+
+/*=========================================
+*  Target specific
+=========================================*/
+#define STREAM_ACCUMULATOR_MIN_32 25
+#define STREAM_ACCUMULATOR_MIN_64 57
+#define STREAM_ACCUMULATOR_MIN ((U32)(ZSTD_32bits() ? STREAM_ACCUMULATOR_MIN_32 : STREAM_ACCUMULATOR_MIN_64))
+
+/*-******************************************
+*  bitStream encoding API (write forward)
+********************************************/
+/* bitStream can mix input from multiple sources.
+*  A critical property of these streams is that they encode and decode in **reverse** direction.
+*  So the first bit sequence you add will be the last to be read, like a LIFO stack.
+*/
+typedef struct {
+	size_t bitContainer;
+	int bitPos;
+	char *startPtr;
+	char *ptr;
+	char *endPtr;
+} BIT_CStream_t;
+
+ZSTD_STATIC size_t BIT_initCStream(BIT_CStream_t *bitC, void *dstBuffer, size_t dstCapacity);
+ZSTD_STATIC void BIT_addBits(BIT_CStream_t *bitC, size_t value, unsigned nbBits);
+ZSTD_STATIC void BIT_flushBits(BIT_CStream_t *bitC);
+ZSTD_STATIC size_t BIT_closeCStream(BIT_CStream_t *bitC);
+
+/* Start with initCStream, providing the size of buffer to write into.
+*  bitStream will never write outside of this buffer.
+*  `dstCapacity` must be >= sizeof(bitD->bitContainer), otherwise @return will be an error code.
+*
+*  bits are first added to a local register.
+*  Local register is size_t, hence 64-bits on 64-bits systems, or 32-bits on 32-bits systems.
+*  Writing data into memory is an explicit operation, performed by the flushBits function.
+*  Hence keep track how many bits are potentially stored into local register to avoid register overflow.
+*  After a flushBits, a maximum of 7 bits might still be stored into local register.
+*
+*  Avoid storing elements of more than 24 bits if you want compatibility with 32-bits bitstream readers.
+*
+*  Last operation is to close the bitStream.
+*  The function returns the final size of CStream in bytes.
+*  If data couldn't fit into `dstBuffer`, it will return a 0 ( == not storable)
+*/
+
+/*-********************************************
+*  bitStream decoding API (read backward)
+**********************************************/
+typedef struct {
+	size_t bitContainer;
+	unsigned bitsConsumed;
+	const char *ptr;
+	const char *start;
+} BIT_DStream_t;
+
+typedef enum {
+	BIT_DStream_unfinished = 0,
+	BIT_DStream_endOfBuffer = 1,
+	BIT_DStream_completed = 2,
+	BIT_DStream_overflow = 3
+} BIT_DStream_status; /* result of BIT_reloadDStream() */
+/* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */
+
+ZSTD_STATIC size_t BIT_initDStream(BIT_DStream_t *bitD, const void *srcBuffer, size_t srcSize);
+ZSTD_STATIC size_t BIT_readBits(BIT_DStream_t *bitD, unsigned nbBits);
+ZSTD_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t *bitD);
+ZSTD_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t *bitD);
+
+/* Start by invoking BIT_initDStream().
+*  A chunk of the bitStream is then stored into a local register.
+*  Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
+*  You can then retrieve bitFields stored into the local register, **in reverse order**.
+*  Local register is explicitly reloaded from memory by the BIT_reloadDStream() method.
+*  A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished.
+*  Otherwise, it can be less than that, so proceed accordingly.
+*  Checking if DStream has reached its end can be performed with BIT_endOfDStream().
+*/
+
+/*-****************************************
+*  unsafe API
+******************************************/
+ZSTD_STATIC void BIT_addBitsFast(BIT_CStream_t *bitC, size_t value, unsigned nbBits);
+/* faster, but works only if value is "clean", meaning all high bits above nbBits are 0 */
+
+ZSTD_STATIC void BIT_flushBitsFast(BIT_CStream_t *bitC);
+/* unsafe version; does not check buffer overflow */
+
+ZSTD_STATIC size_t BIT_readBitsFast(BIT_DStream_t *bitD, unsigned nbBits);
+/* faster, but works only if nbBits >= 1 */
+
+/*-**************************************************************
+*  Internal functions
+****************************************************************/
+ZSTD_STATIC unsigned BIT_highbit32(register U32 val) { return 31 - __builtin_clz(val); }
+
+/*=====    Local Constants   =====*/
+static const unsigned BIT_mask[] = {0,       1,       3,       7,	0xF,      0x1F,     0x3F,     0x7F,      0xFF,
+				    0x1FF,   0x3FF,   0x7FF,   0xFFF,    0x1FFF,   0x3FFF,   0x7FFF,   0xFFFF,    0x1FFFF,
+				    0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF, 0xFFFFFF, 0x1FFFFFF, 0x3FFFFFF}; /* up to 26 bits */
+
+/*-**************************************************************
+*  bitStream encoding
+****************************************************************/
+/*! BIT_initCStream() :
+ *  `dstCapacity` must be > sizeof(void*)
+ *  @return : 0 if success,
+			  otherwise an error code (can be tested using ERR_isError() ) */
+ZSTD_STATIC size_t BIT_initCStream(BIT_CStream_t *bitC, void *startPtr, size_t dstCapacity)
+{
+	bitC->bitContainer = 0;
+	bitC->bitPos = 0;
+	bitC->startPtr = (char *)startPtr;
+	bitC->ptr = bitC->startPtr;
+	bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->ptr);
+	if (dstCapacity <= sizeof(bitC->ptr))
+		return ERROR(dstSize_tooSmall);
+	return 0;
+}
+
+/*! BIT_addBits() :
+	can add up to 26 bits into `bitC`.
+	Does not check for register overflow ! */
+ZSTD_STATIC void BIT_addBits(BIT_CStream_t *bitC, size_t value, unsigned nbBits)
+{
+	bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos;
+	bitC->bitPos += nbBits;
+}
+
+/*! BIT_addBitsFast() :
+ *  works only if `value` is _clean_, meaning all high bits above nbBits are 0 */
+ZSTD_STATIC void BIT_addBitsFast(BIT_CStream_t *bitC, size_t value, unsigned nbBits)
+{
+	bitC->bitContainer |= value << bitC->bitPos;
+	bitC->bitPos += nbBits;
+}
+
+/*! BIT_flushBitsFast() :
+ *  unsafe version; does not check buffer overflow */
+ZSTD_STATIC void BIT_flushBitsFast(BIT_CStream_t *bitC)
+{
+	size_t const nbBytes = bitC->bitPos >> 3;
+	ZSTD_writeLEST(bitC->ptr, bitC->bitContainer);
+	bitC->ptr += nbBytes;
+	bitC->bitPos &= 7;
+	bitC->bitContainer >>= nbBytes * 8; /* if bitPos >= sizeof(bitContainer)*8 --> undefined behavior */
+}
+
+/*! BIT_flushBits() :
+ *  safe version; check for buffer overflow, and prevents it.
+ *  note : does not signal buffer overflow. This will be revealed later on using BIT_closeCStream() */
+ZSTD_STATIC void BIT_flushBits(BIT_CStream_t *bitC)
+{
+	size_t const nbBytes = bitC->bitPos >> 3;
+	ZSTD_writeLEST(bitC->ptr, bitC->bitContainer);
+	bitC->ptr += nbBytes;
+	if (bitC->ptr > bitC->endPtr)
+		bitC->ptr = bitC->endPtr;
+	bitC->bitPos &= 7;
+	bitC->bitContainer >>= nbBytes * 8; /* if bitPos >= sizeof(bitContainer)*8 --> undefined behavior */
+}
+
+/*! BIT_closeCStream() :
+ *  @return : size of CStream, in bytes,
+			  or 0 if it could not fit into dstBuffer */
+ZSTD_STATIC size_t BIT_closeCStream(BIT_CStream_t *bitC)
+{
+	BIT_addBitsFast(bitC, 1, 1); /* endMark */
+	BIT_flushBits(bitC);
+
+	if (bitC->ptr >= bitC->endPtr)
+		return 0; /* doesn't fit within authorized budget : cancel */
+
+	return (bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0);
+}
+
+/*-********************************************************
+* bitStream decoding
+**********************************************************/
+/*! BIT_initDStream() :
+*   Initialize a BIT_DStream_t.
+*   `bitD` : a pointer to an already allocated BIT_DStream_t structure.
+*   `srcSize` must be the *exact* size of the bitStream, in bytes.
+*   @return : size of stream (== srcSize) or an errorCode if a problem is detected
+*/
+ZSTD_STATIC size_t BIT_initDStream(BIT_DStream_t *bitD, const void *srcBuffer, size_t srcSize)
+{
+	if (srcSize < 1) {
+		memset(bitD, 0, sizeof(*bitD));
+		return ERROR(srcSize_wrong);
+	}
+
+	if (srcSize >= sizeof(bitD->bitContainer)) { /* normal case */
+		bitD->start = (const char *)srcBuffer;
+		bitD->ptr = (const char *)srcBuffer + srcSize - sizeof(bitD->bitContainer);
+		bitD->bitContainer = ZSTD_readLEST(bitD->ptr);
+		{
+			BYTE const lastByte = ((const BYTE *)srcBuffer)[srcSize - 1];
+			bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */
+			if (lastByte == 0)
+				return ERROR(GENERIC); /* endMark not present */
+		}
+	} else {
+		bitD->start = (const char *)srcBuffer;
+		bitD->ptr = bitD->start;
+		bitD->bitContainer = *(const BYTE *)(bitD->start);
+		switch (srcSize) {
+		case 7: bitD->bitContainer += (size_t)(((const BYTE *)(srcBuffer))[6]) << (sizeof(bitD->bitContainer) * 8 - 16);
+		case 6: bitD->bitContainer += (size_t)(((const BYTE *)(srcBuffer))[5]) << (sizeof(bitD->bitContainer) * 8 - 24);
+		case 5: bitD->bitContainer += (size_t)(((const BYTE *)(srcBuffer))[4]) << (sizeof(bitD->bitContainer) * 8 - 32);
+		case 4: bitD->bitContainer += (size_t)(((const BYTE *)(srcBuffer))[3]) << 24;
+		case 3: bitD->bitContainer += (size_t)(((const BYTE *)(srcBuffer))[2]) << 16;
+		case 2: bitD->bitContainer += (size_t)(((const BYTE *)(srcBuffer))[1]) << 8;
+		default:;
+		}
+		{
+			BYTE const lastByte = ((const BYTE *)srcBuffer)[srcSize - 1];
+			bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
+			if (lastByte == 0)
+				return ERROR(GENERIC); /* endMark not present */
+		}
+		bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize) * 8;
+	}
+
+	return srcSize;
+}
+
+ZSTD_STATIC size_t BIT_getUpperBits(size_t bitContainer, U32 const start) { return bitContainer >> start; }
+
+ZSTD_STATIC size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits) { return (bitContainer >> start) & BIT_mask[nbBits]; }
+
+ZSTD_STATIC size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) { return bitContainer & BIT_mask[nbBits]; }
+
+/*! BIT_lookBits() :
+ *  Provides next n bits from local register.
+ *  local register is not modified.
+ *  On 32-bits, maxNbBits==24.
+ *  On 64-bits, maxNbBits==56.
+ *  @return : value extracted
+ */
+ZSTD_STATIC size_t BIT_lookBits(const BIT_DStream_t *bitD, U32 nbBits)
+{
+	U32 const bitMask = sizeof(bitD->bitContainer) * 8 - 1;
+	return ((bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> 1) >> ((bitMask - nbBits) & bitMask);
+}
+
+/*! BIT_lookBitsFast() :
+*   unsafe version; only works only if nbBits >= 1 */
+ZSTD_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t *bitD, U32 nbBits)
+{
+	U32 const bitMask = sizeof(bitD->bitContainer) * 8 - 1;
+	return (bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> (((bitMask + 1) - nbBits) & bitMask);
+}
+
+ZSTD_STATIC void BIT_skipBits(BIT_DStream_t *bitD, U32 nbBits) { bitD->bitsConsumed += nbBits; }
+
+/*! BIT_readBits() :
+ *  Read (consume) next n bits from local register and update.
+ *  Pay attention to not read more than nbBits contained into local register.
+ *  @return : extracted value.
+ */
+ZSTD_STATIC size_t BIT_readBits(BIT_DStream_t *bitD, U32 nbBits)
+{
+	size_t const value = BIT_lookBits(bitD, nbBits);
+	BIT_skipBits(bitD, nbBits);
+	return value;
+}
+
+/*! BIT_readBitsFast() :
+*   unsafe version; only works only if nbBits >= 1 */
+ZSTD_STATIC size_t BIT_readBitsFast(BIT_DStream_t *bitD, U32 nbBits)
+{
+	size_t const value = BIT_lookBitsFast(bitD, nbBits);
+	BIT_skipBits(bitD, nbBits);
+	return value;
+}
+
+/*! BIT_reloadDStream() :
+*   Refill `bitD` from buffer previously set in BIT_initDStream() .
+*   This function is safe, it guarantees it will not read beyond src buffer.
+*   @return : status of `BIT_DStream_t` internal register.
+			  if status == BIT_DStream_unfinished, internal register is filled with >= (sizeof(bitD->bitContainer)*8 - 7) bits */
+ZSTD_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t *bitD)
+{
+	if (bitD->bitsConsumed > (sizeof(bitD->bitContainer) * 8)) /* should not happen => corruption detected */
+		return BIT_DStream_overflow;
+
+	if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer)) {
+		bitD->ptr -= bitD->bitsConsumed >> 3;
+		bitD->bitsConsumed &= 7;
+		bitD->bitContainer = ZSTD_readLEST(bitD->ptr);
+		return BIT_DStream_unfinished;
+	}
+	if (bitD->ptr == bitD->start) {
+		if (bitD->bitsConsumed < sizeof(bitD->bitContainer) * 8)
+			return BIT_DStream_endOfBuffer;
+		return BIT_DStream_completed;
+	}
+	{
+		U32 nbBytes = bitD->bitsConsumed >> 3;
+		BIT_DStream_status result = BIT_DStream_unfinished;
+		if (bitD->ptr - nbBytes < bitD->start) {
+			nbBytes = (U32)(bitD->ptr - bitD->start); /* ptr > start */
+			result = BIT_DStream_endOfBuffer;
+		}
+		bitD->ptr -= nbBytes;
+		bitD->bitsConsumed -= nbBytes * 8;
+		bitD->bitContainer = ZSTD_readLEST(bitD->ptr); /* reminder : srcSize > sizeof(bitD) */
+		return result;
+	}
+}
+
+/*! BIT_endOfDStream() :
+*   @return Tells if DStream has exactly reached its end (all bits consumed).
+*/
+ZSTD_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t *DStream)
+{
+	return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer) * 8));
+}
+
+#endif /* BITSTREAM_H_MODULE */
--- a/lib/zstd/decompress.c
+++ b/lib/zstd/decompress.c
--- a/lib/zstd/entropy_common.c
+++ b/lib/zstd/entropy_common.c
@ -0,0 +1,213 @@
+// SPDX-License-Identifier: (GPL-2.0 or BSD-2-Clause)
+/*
+ * Common functions of New Generation Entropy library
+ * Copyright (C) 2016, Yann Collet.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ */
+
+/* *************************************
+*  Dependencies
+***************************************/
+#include "error_private.h" /* ERR_*, ERROR */
+#include "fse.h"
+#include "huf.h"
+#include "mem.h"
+
+/*===   Version   ===*/
+unsigned FSE_versionNumber(void) { return FSE_VERSION_NUMBER; }
+
+/*===   Error Management   ===*/
+unsigned FSE_isError(size_t code) { return ERR_isError(code); }
+
+unsigned HUF_isError(size_t code) { return ERR_isError(code); }
+
+/*-**************************************************************
+*  FSE NCount encoding-decoding
+****************************************************************/
+size_t FSE_readNCount(short *normalizedCounter, unsigned *maxSVPtr, unsigned *tableLogPtr, const void *headerBuffer, size_t hbSize)
+{
+	const BYTE *const istart = (const BYTE *)headerBuffer;
+	const BYTE *const iend = istart + hbSize;
+	const BYTE *ip = istart;
+	int nbBits;
+	int remaining;
+	int threshold;
+	U32 bitStream;
+	int bitCount;
+	unsigned charnum = 0;
+	int previous0 = 0;
+
+	if (hbSize < 4)
+		return ERROR(srcSize_wrong);
+	bitStream = ZSTD_readLE32(ip);
+	nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG; /* extract tableLog */
+	if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX)
+		return ERROR(tableLog_tooLarge);
+	bitStream >>= 4;
+	bitCount = 4;
+	*tableLogPtr = nbBits;
+	remaining = (1 << nbBits) + 1;
+	threshold = 1 << nbBits;
+	nbBits++;
+
+	while ((remaining > 1) & (charnum <= *maxSVPtr)) {
+		if (previous0) {
+			unsigned n0 = charnum;
+			while ((bitStream & 0xFFFF) == 0xFFFF) {
+				n0 += 24;
+				if (ip < iend - 5) {
+					ip += 2;
+					bitStream = ZSTD_readLE32(ip) >> bitCount;
+				} else {
+					bitStream >>= 16;
+					bitCount += 16;
+				}
+			}
+			while ((bitStream & 3) == 3) {
+				n0 += 3;
+				bitStream >>= 2;
+				bitCount += 2;
+			}
+			n0 += bitStream & 3;
+			bitCount += 2;
+			if (n0 > *maxSVPtr)
+				return ERROR(maxSymbolValue_tooSmall);
+			while (charnum < n0)
+				normalizedCounter[charnum++] = 0;
+			if ((ip <= iend - 7) || (ip + (bitCount >> 3) <= iend - 4)) {
+				ip += bitCount >> 3;
+				bitCount &= 7;
+				bitStream = ZSTD_readLE32(ip) >> bitCount;
+			} else {
+				bitStream >>= 2;
+			}
+		}
+		{
+			int const max = (2 * threshold - 1) - remaining;
+			int count;
+
+			if ((bitStream & (threshold - 1)) < (U32)max) {
+				count = bitStream & (threshold - 1);
+				bitCount += nbBits - 1;
+			} else {
+				count = bitStream & (2 * threshold - 1);
+				if (count >= threshold)
+					count -= max;
+				bitCount += nbBits;
+			}
+
+			count--;				 /* extra accuracy */
+			remaining -= count < 0 ? -count : count; /* -1 means +1 */
+			normalizedCounter[charnum++] = (short)count;
+			previous0 = !count;
+			while (remaining < threshold) {
+				nbBits--;
+				threshold >>= 1;
+			}
+
+			if ((ip <= iend - 7) || (ip + (bitCount >> 3) <= iend - 4)) {
+				ip += bitCount >> 3;
+				bitCount &= 7;
+			} else {
+				bitCount -= (int)(8 * (iend - 4 - ip));
+				ip = iend - 4;
+			}
+			bitStream = ZSTD_readLE32(ip) >> (bitCount & 31);
+		}
+	} /* while ((remaining>1) & (charnum<=*maxSVPtr)) */
+	if (remaining != 1)
+		return ERROR(corruption_detected);
+	if (bitCount > 32)
+		return ERROR(corruption_detected);
+	*maxSVPtr = charnum - 1;
+
+	ip += (bitCount + 7) >> 3;
+	return ip - istart;
+}
+
+/*! HUF_readStats() :
+	Read compact Huffman tree, saved by HUF_writeCTable().
+	`huffWeight` is destination buffer.
+	`rankStats` is assumed to be a table of at least HUF_TABLELOG_MAX U32.
+	@return : size read from `src` , or an error Code .
+	Note : Needed by HUF_readCTable() and HUF_readDTableX?() .
+*/
+size_t HUF_readStats_wksp(BYTE *huffWeight, size_t hwSize, U32 *rankStats, U32 *nbSymbolsPtr, U32 *tableLogPtr, const void *src, size_t srcSize, void *workspace, size_t workspaceSize)
+{
+	U32 weightTotal;
+	const BYTE *ip = (const BYTE *)src;
+	size_t iSize;
+	size_t oSize;
+
+	if (!srcSize)
+		return ERROR(srcSize_wrong);
+	iSize = ip[0];
+	/* memset(huffWeight, 0, hwSize);   */ /* is not necessary, even though some analyzer complain ... */
+
+	if (iSize >= 128) { /* special header */
+		oSize = iSize - 127;
+		iSize = ((oSize + 1) / 2);
+		if (iSize + 1 > srcSize)
+			return ERROR(srcSize_wrong);
+		if (oSize >= hwSize)
+			return ERROR(corruption_detected);
+		ip += 1;
+		{
+			U32 n;
+			for (n = 0; n < oSize; n += 2) {
+				huffWeight[n] = ip[n / 2] >> 4;
+				huffWeight[n + 1] = ip[n / 2] & 15;
+			}
+		}
+	} else {						 /* header compressed with FSE (normal case) */
+		if (iSize + 1 > srcSize)
+			return ERROR(srcSize_wrong);
+		oSize = FSE_decompress_wksp(huffWeight, hwSize - 1, ip + 1, iSize, 6, workspace, workspaceSize); /* max (hwSize-1) values decoded, as last one is implied */
+		if (FSE_isError(oSize))
+			return oSize;
+	}
+
+	/* collect weight stats */
+	memset(rankStats, 0, (HUF_TABLELOG_MAX + 1) * sizeof(U32));
+	weightTotal = 0;
+	{
+		U32 n;
+		for (n = 0; n < oSize; n++) {
+			if (huffWeight[n] >= HUF_TABLELOG_MAX)
+				return ERROR(corruption_detected);
+			rankStats[huffWeight[n]]++;
+			weightTotal += (1 << huffWeight[n]) >> 1;
+		}
+	}
+	if (weightTotal == 0)
+		return ERROR(corruption_detected);
+
+	/* get last non-null symbol weight (implied, total must be 2^n) */
+	{
+		U32 const tableLog = BIT_highbit32(weightTotal) + 1;
+		if (tableLog > HUF_TABLELOG_MAX)
+			return ERROR(corruption_detected);
+		*tableLogPtr = tableLog;
+		/* determine last weight */
+		{
+			U32 const total = 1 << tableLog;
+			U32 const rest = total - weightTotal;
+			U32 const verif = 1 << BIT_highbit32(rest);
+			U32 const lastWeight = BIT_highbit32(rest) + 1;
+			if (verif != rest)
+				return ERROR(corruption_detected); /* last value must be a clean power of 2 */
+			huffWeight[oSize] = (BYTE)lastWeight;
+			rankStats[lastWeight]++;
+		}
+	}
+
+	/* check tree construction validity */
+	if ((rankStats[1] < 2) || (rankStats[1] & 1))
+		return ERROR(corruption_detected); /* by construction : at least 2 elts of rank 1, must be even */
+
+	/* results */
+	*nbSymbolsPtr = (U32)(oSize + 1);
+	return iSize + 1;
+}
--- a/lib/zstd/error_private.h
+++ b/lib/zstd/error_private.h
@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: (GPL-2.0 or BSD-3-Clause-Clear) */
+/**
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ */
+
+/* Note : this module is expected to remain private, do not expose it */
+
+#ifndef ERROR_H_MODULE
+#define ERROR_H_MODULE
+
+/* ****************************************
+*  Dependencies
+******************************************/
+#include <linux/types.h> /* size_t */
+#include <linux/zstd.h>  /* enum list */
+
+/* ****************************************
+*  Compiler-specific
+******************************************/
+#define ERR_STATIC static __attribute__((unused))
+
+/*-****************************************
+*  Customization (error_public.h)
+******************************************/
+typedef ZSTD_ErrorCode ERR_enum;
+#define PREFIX(name) ZSTD_error_##name
+
+/*-****************************************
+*  Error codes handling
+******************************************/
+#define ERROR(name) ((size_t)-PREFIX(name))
+
+ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); }
+
+ERR_STATIC ERR_enum ERR_getErrorCode(size_t code)
+{
+	if (!ERR_isError(code))
+		return (ERR_enum)0;
+	return (ERR_enum)(0 - code);
+}
+
+#endif /* ERROR_H_MODULE */
--- a/lib/zstd/fse.h
+++ b/lib/zstd/fse.h
@ -0,0 +1,545 @@
+/* SPDX-License-Identifier: (GPL-2.0 or BSD-2-Clause) */
+/*
+ * FSE : Finite State Entropy codec
+ * Public Prototypes declaration
+ * Copyright (C) 2013-2016, Yann Collet.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ */
+#ifndef FSE_H
+#define FSE_H
+
+/*-*****************************************
+*  Dependencies
+******************************************/
+#include <linux/types.h> /* size_t, ptrdiff_t */
+
+/*-*****************************************
+*  FSE_PUBLIC_API : control library symbols visibility
+******************************************/
+#define FSE_PUBLIC_API
+
+/*------   Version   ------*/
+#define FSE_VERSION_MAJOR 0
+#define FSE_VERSION_MINOR 9
+#define FSE_VERSION_RELEASE 0
+
+#define FSE_LIB_VERSION FSE_VERSION_MAJOR.FSE_VERSION_MINOR.FSE_VERSION_RELEASE
+#define FSE_QUOTE(str) #str
+#define FSE_EXPAND_AND_QUOTE(str) FSE_QUOTE(str)
+#define FSE_VERSION_STRING FSE_EXPAND_AND_QUOTE(FSE_LIB_VERSION)
+
+#define FSE_VERSION_NUMBER (FSE_VERSION_MAJOR * 100 * 100 + FSE_VERSION_MINOR * 100 + FSE_VERSION_RELEASE)
+FSE_PUBLIC_API unsigned FSE_versionNumber(void); /**< library version number; to be used when checking dll version */
+
+/*-*****************************************
+*  Tool functions
+******************************************/
+FSE_PUBLIC_API size_t FSE_compressBound(size_t size); /* maximum compressed size */
+
+/* Error Management */
+FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return value is an error code */
+
+/*-*****************************************
+*  FSE detailed API
+******************************************/
+/*!
+FSE_compress() does the following:
+1. count symbol occurrence from source[] into table count[]
+2. normalize counters so that sum(count[]) == Power_of_2 (2^tableLog)
+3. save normalized counters to memory buffer using writeNCount()
+4. build encoding table 'CTable' from normalized counters
+5. encode the data stream using encoding table 'CTable'
+
+FSE_decompress() does the following:
+1. read normalized counters with readNCount()
+2. build decoding table 'DTable' from normalized counters
+3. decode the data stream using decoding table 'DTable'
+
+The following API allows targeting specific sub-functions for advanced tasks.
+For example, it's possible to compress several blocks using the same 'CTable',
+or to save and provide normalized distribution using external method.
+*/
+
+/* *** COMPRESSION *** */
+/*! FSE_optimalTableLog():
+	dynamically downsize 'tableLog' when conditions are met.
+	It saves CPU time, by using smaller tables, while preserving or even improving compression ratio.
+	@return : recommended tableLog (necessarily <= 'maxTableLog') */
+FSE_PUBLIC_API unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
+
+/*! FSE_normalizeCount():
+	normalize counts so that sum(count[]) == Power_of_2 (2^tableLog)
+	'normalizedCounter' is a table of short, of minimum size (maxSymbolValue+1).
+	@return : tableLog,
+			  or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_normalizeCount(short *normalizedCounter, unsigned tableLog, const unsigned *count, size_t srcSize, unsigned maxSymbolValue);
+
+/*! FSE_NCountWriteBound():
+	Provides the maximum possible size of an FSE normalized table, given 'maxSymbolValue' and 'tableLog'.
+	Typically useful for allocation purpose. */
+FSE_PUBLIC_API size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog);
+
+/*! FSE_writeNCount():
+	Compactly save 'normalizedCounter' into 'buffer'.
+	@return : size of the compressed table,
+			  or an errorCode, which can be tested using FSE_isError(). */
+FSE_PUBLIC_API size_t FSE_writeNCount(void *buffer, size_t bufferSize, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+/*! Constructor and Destructor of FSE_CTable.
+	Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */
+typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */
+
+/*! FSE_compress_usingCTable():
+	Compress `src` using `ct` into `dst` which must be already allocated.
+	@return : size of compressed data (<= `dstCapacity`),
+			  or 0 if compressed data could not fit into `dst`,
+			  or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_compress_usingCTable(void *dst, size_t dstCapacity, const void *src, size_t srcSize, const FSE_CTable *ct);
+
+/*!
+Tutorial :
+----------
+The first step is to count all symbols. FSE_count() does this job very fast.
+Result will be saved into 'count', a table of unsigned int, which must be already allocated, and have 'maxSymbolValuePtr[0]+1' cells.
+'src' is a table of bytes of size 'srcSize'. All values within 'src' MUST be <= maxSymbolValuePtr[0]
+maxSymbolValuePtr[0] will be updated, with its real value (necessarily <= original value)
+FSE_count() will return the number of occurrence of the most frequent symbol.
+This can be used to know if there is a single symbol within 'src', and to quickly evaluate its compressibility.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
+
+The next step is to normalize the frequencies.
+FSE_normalizeCount() will ensure that sum of frequencies is == 2 ^'tableLog'.
+It also guarantees a minimum of 1 to any Symbol with frequency >= 1.
+You can use 'tableLog'==0 to mean "use default tableLog value".
+If you are unsure of which tableLog value to use, you can ask FSE_optimalTableLog(),
+which will provide the optimal valid tableLog given sourceSize, maxSymbolValue, and a user-defined maximum (0 means "default").
+
+The result of FSE_normalizeCount() will be saved into a table,
+called 'normalizedCounter', which is a table of signed short.
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValue+1' cells.
+The return value is tableLog if everything proceeded as expected.
+It is 0 if there is a single symbol within distribution.
+If there is an error (ex: invalid tableLog value), the function will return an ErrorCode (which can be tested using FSE_isError()).
+
+'normalizedCounter' can be saved in a compact manner to a memory area using FSE_writeNCount().
+'buffer' must be already allocated.
+For guaranteed success, buffer size must be at least FSE_headerBound().
+The result of the function is the number of bytes written into 'buffer'.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError(); ex : buffer size too small).
+
+'normalizedCounter' can then be used to create the compression table 'CTable'.
+The space required by 'CTable' must be already allocated, using FSE_createCTable().
+You can then use FSE_buildCTable() to fill 'CTable'.
+If there is an error, both functions will return an ErrorCode (which can be tested using FSE_isError()).
+
+'CTable' can then be used to compress 'src', with FSE_compress_usingCTable().
+Similar to FSE_count(), the convention is that 'src' is assumed to be a table of char of size 'srcSize'
+The function returns the size of compressed data (without header), necessarily <= `dstCapacity`.
+If it returns '0', compressed data could not fit into 'dst'.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
+*/
+
+/* *** DECOMPRESSION *** */
+
+/*! FSE_readNCount():
+	Read compactly saved 'normalizedCounter' from 'rBuffer'.
+	@return : size read from 'rBuffer',
+			  or an errorCode, which can be tested using FSE_isError().
+			  maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */
+FSE_PUBLIC_API size_t FSE_readNCount(short *normalizedCounter, unsigned *maxSymbolValuePtr, unsigned *tableLogPtr, const void *rBuffer, size_t rBuffSize);
+
+/*! Constructor and Destructor of FSE_DTable.
+	Note that its size depends on 'tableLog' */
+typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */
+
+/*! FSE_buildDTable():
+	Builds 'dt', which must be already allocated, using FSE_createDTable().
+	return : 0, or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable *dt, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void *workspace, size_t workspaceSize);
+
+/*! FSE_decompress_usingDTable():
+	Decompress compressed source `cSrc` of size `cSrcSize` using `dt`
+	into `dst` which must be already allocated.
+	@return : size of regenerated data (necessarily <= `dstCapacity`),
+			  or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void *dst, size_t dstCapacity, const void *cSrc, size_t cSrcSize, const FSE_DTable *dt);
+
+/*!
+Tutorial :
+----------
+(Note : these functions only decompress FSE-compressed blocks.
+ If block is uncompressed, use memcpy() instead
+ If block is a single repeated byte, use memset() instead )
+
+The first step is to obtain the normalized frequencies of symbols.
+This can be performed by FSE_readNCount() if it was saved using FSE_writeNCount().
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of signed short.
+In practice, that means it's necessary to know 'maxSymbolValue' beforehand,
+or size the table to handle worst case situations (typically 256).
+FSE_readNCount() will provide 'tableLog' and 'maxSymbolValue'.
+The result of FSE_readNCount() is the number of bytes read from 'rBuffer'.
+Note that 'rBufferSize' must be at least 4 bytes, even if useful information is less than that.
+If there is an error, the function will return an error code, which can be tested using FSE_isError().
+
+The next step is to build the decompression tables 'FSE_DTable' from 'normalizedCounter'.
+This is performed by the function FSE_buildDTable().
+The space required by 'FSE_DTable' must be already allocated using FSE_createDTable().
+If there is an error, the function will return an error code, which can be tested using FSE_isError().
+
+`FSE_DTable` can then be used to decompress `cSrc`, with FSE_decompress_usingDTable().
+`cSrcSize` must be strictly correct, otherwise decompression will fail.
+FSE_decompress_usingDTable() result will tell how many bytes were regenerated (<=`dstCapacity`).
+If there is an error, the function will return an error code, which can be tested using FSE_isError(). (ex: dst buffer too small)
+*/
+
+/* *** Dependency *** */
+#include "bitstream.h"
+
+/* *****************************************
+*  Static allocation
+*******************************************/
+/* FSE buffer bounds */
+#define FSE_NCOUNTBOUND 512
+#define FSE_BLOCKBOUND(size) (size + (size >> 7))
+#define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size)) /* Macro version, useful for static allocation */
+
+/* It is possible to statically allocate FSE CTable/DTable as a table of FSE_CTable/FSE_DTable using below macros */
+#define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) (1 + (1 << (maxTableLog - 1)) + ((maxSymbolValue + 1) * 2))
+#define FSE_DTABLE_SIZE_U32(maxTableLog) (1 + (1 << maxTableLog))
+
+/* *****************************************
+*  FSE advanced API
+*******************************************/
+/* FSE_count_wksp() :
+ * Same as FSE_count(), but using an externally provided scratch buffer.
+ * `workSpace` size must be table of >= `1024` unsigned
+ */
+size_t FSE_count_wksp(unsigned *count, unsigned *maxSymbolValuePtr, const void *source, size_t sourceSize, unsigned *workSpace);
+
+/* FSE_countFast_wksp() :
+ * Same as FSE_countFast(), but using an externally provided scratch buffer.
+ * `workSpace` must be a table of minimum `1024` unsigned
+ */
+size_t FSE_countFast_wksp(unsigned *count, unsigned *maxSymbolValuePtr, const void *src, size_t srcSize, unsigned *workSpace);
+
+/*! FSE_count_simple
+ * Same as FSE_countFast(), but does not use any additional memory (not even on stack).
+ * This function is unsafe, and will segfault if any value within `src` is `> *maxSymbolValuePtr` (presuming it's also the size of `count`).
+*/
+size_t FSE_count_simple(unsigned *count, unsigned *maxSymbolValuePtr, const void *src, size_t srcSize);
+
+unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus);
+/**< same as FSE_optimalTableLog(), which used `minus==2` */
+
+size_t FSE_buildCTable_raw(FSE_CTable *ct, unsigned nbBits);
+/**< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */
+
+size_t FSE_buildCTable_rle(FSE_CTable *ct, unsigned char symbolValue);
+/**< build a fake FSE_CTable, designed to compress always the same symbolValue */
+
+/* FSE_buildCTable_wksp() :
+ * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`).
+ * `wkspSize` must be >= `(1<<tableLog)`.
+ */
+size_t FSE_buildCTable_wksp(FSE_CTable *ct, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void *workSpace, size_t wkspSize);
+
+size_t FSE_buildDTable_raw(FSE_DTable *dt, unsigned nbBits);
+/**< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */
+
+size_t FSE_buildDTable_rle(FSE_DTable *dt, unsigned char symbolValue);
+/**< build a fake FSE_DTable, designed to always generate the same symbolValue */
+
+size_t FSE_decompress_wksp(void *dst, size_t dstCapacity, const void *cSrc, size_t cSrcSize, unsigned maxLog, void *workspace, size_t workspaceSize);
+/**< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DTABLE_SIZE_U32(maxLog)` */
+
+/* *****************************************
+*  FSE symbol compression API
+*******************************************/
+/*!
+   This API consists of small unitary functions, which highly benefit from being inlined.
+   Hence their body are included in next section.
+*/
+typedef struct {
+	ptrdiff_t value;
+	const void *stateTable;
+	const void *symbolTT;
+	unsigned stateLog;
+} FSE_CState_t;
+
+static void FSE_initCState(FSE_CState_t *CStatePtr, const FSE_CTable *ct);
+
+static void FSE_encodeSymbol(BIT_CStream_t *bitC, FSE_CState_t *CStatePtr, unsigned symbol);
+
+static void FSE_flushCState(BIT_CStream_t *bitC, const FSE_CState_t *CStatePtr);
+
+/**<
+These functions are inner components of FSE_compress_usingCTable().
+They allow the creation of custom streams, mixing multiple tables and bit sources.
+
+A key property to keep in mind is that encoding and decoding are done **in reverse direction**.
+So the first symbol you will encode is the last you will decode, like a LIFO stack.
+
+You will need a few variables to track your CStream. They are :
+
+FSE_CTable    ct;         // Provided by FSE_buildCTable()
+BIT_CStream_t bitStream;  // bitStream tracking structure
+FSE_CState_t  state;      // State tracking structure (can have several)
+
+
+The first thing to do is to init bitStream and state.
+	size_t errorCode = BIT_initCStream(&bitStream, dstBuffer, maxDstSize);
+	FSE_initCState(&state, ct);
+
+Note that BIT_initCStream() can produce an error code, so its result should be tested, using FSE_isError();
+You can then encode your input data, byte after byte.
+FSE_encodeSymbol() outputs a maximum of 'tableLog' bits at a time.
+Remember decoding will be done in reverse direction.
+	FSE_encodeByte(&bitStream, &state, symbol);
+
+At any time, you can also add any bit sequence.
+Note : maximum allowed nbBits is 25, for compatibility with 32-bits decoders
+	BIT_addBits(&bitStream, bitField, nbBits);
+
+The above methods don't commit data to memory, they just store it into local register, for speed.
+Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
+Writing data to memory is a manual operation, performed by the flushBits function.
+	BIT_flushBits(&bitStream);
+
+Your last FSE encoding operation shall be to flush your last state value(s).
+	FSE_flushState(&bitStream, &state);
+
+Finally, you must close the bitStream.
+The function returns the size of CStream in bytes.
+If data couldn't fit into dstBuffer, it will return a 0 ( == not compressible)
+If there is an error, it returns an errorCode (which can be tested using FSE_isError()).
+	size_t size = BIT_closeCStream(&bitStream);
+*/
+
+/* *****************************************
+*  FSE symbol decompression API
+*******************************************/
+typedef struct {
+	size_t state;
+	const void *table; /* precise table may vary, depending on U16 */
+} FSE_DState_t;
+
+static void FSE_initDState(FSE_DState_t *DStatePtr, BIT_DStream_t *bitD, const FSE_DTable *dt);
+
+static unsigned char FSE_decodeSymbol(FSE_DState_t *DStatePtr, BIT_DStream_t *bitD);
+
+static unsigned FSE_endOfDState(const FSE_DState_t *DStatePtr);
+
+/**<
+Let's now decompose FSE_decompress_usingDTable() into its unitary components.
+You will decode FSE-encoded symbols from the bitStream,
+and also any other bitFields you put in, **in reverse order**.
+
+You will need a few variables to track your bitStream. They are :
+
+BIT_DStream_t DStream;    // Stream context
+FSE_DState_t  DState;     // State context. Multiple ones are possible
+FSE_DTable*   DTablePtr;  // Decoding table, provided by FSE_buildDTable()
+
+The first thing to do is to init the bitStream.
+	errorCode = BIT_initDStream(&DStream, srcBuffer, srcSize);
+
+You should then retrieve your initial state(s)
+(in reverse flushing order if you have several ones) :
+	errorCode = FSE_initDState(&DState, &DStream, DTablePtr);
+
+You can then decode your data, symbol after symbol.
+For information the maximum number of bits read by FSE_decodeSymbol() is 'tableLog'.
+Keep in mind that symbols are decoded in reverse order, like a LIFO stack (last in, first out).
+	unsigned char symbol = FSE_decodeSymbol(&DState, &DStream);
+
+You can retrieve any bitfield you eventually stored into the bitStream (in reverse order)
+Note : maximum allowed nbBits is 25, for 32-bits compatibility
+	size_t bitField = BIT_readBits(&DStream, nbBits);
+
+All above operations only read from local register (which size depends on size_t).
+Refueling the register from memory is manually performed by the reload method.
+	endSignal = FSE_reloadDStream(&DStream);
+
+BIT_reloadDStream() result tells if there is still some more data to read from DStream.
+BIT_DStream_unfinished : there is still some data left into the DStream.
+BIT_DStream_endOfBuffer : Dstream reached end of buffer. Its container may no longer be completely filled.
+BIT_DStream_completed : Dstream reached its exact end, corresponding in general to decompression completed.
+BIT_DStream_tooFar : Dstream went too far. Decompression result is corrupted.
+
+When reaching end of buffer (BIT_DStream_endOfBuffer), progress slowly, notably if you decode multiple symbols per loop,
+to properly detect the exact end of stream.
+After each decoded symbol, check if DStream is fully consumed using this simple test :
+	BIT_reloadDStream(&DStream) >= BIT_DStream_completed
+
+When it's done, verify decompression is fully completed, by checking both DStream and the relevant states.
+Checking if DStream has reached its end is performed by :
+	BIT_endOfDStream(&DStream);
+Check also the states. There might be some symbols left there, if some high probability ones (>50%) are possible.
+	FSE_endOfDState(&DState);
+*/
+
+/* *****************************************
+*  FSE unsafe API
+*******************************************/
+static unsigned char FSE_decodeSymbolFast(FSE_DState_t *DStatePtr, BIT_DStream_t *bitD);
+/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
+
+/* *****************************************
+*  Implementation of inlined functions
+*******************************************/
+typedef struct {
+	int deltaFindState;
+	U32 deltaNbBits;
+} FSE_symbolCompressionTransform; /* total 8 bytes */
+
+ZSTD_STATIC void FSE_initCState(FSE_CState_t *statePtr, const FSE_CTable *ct)
+{
+	const void *ptr = ct;
+	const U16 *u16ptr = (const U16 *)ptr;
+	const U32 tableLog = ZSTD_read16(ptr);
+	statePtr->value = (ptrdiff_t)1 << tableLog;
+	statePtr->stateTable = u16ptr + 2;
+	statePtr->symbolTT = ((const U32 *)ct + 1 + (tableLog ? (1 << (tableLog - 1)) : 1));
+	statePtr->stateLog = tableLog;
+}
+
+/*! FSE_initCState2() :
+*   Same as FSE_initCState(), but the first symbol to include (which will be the last to be read)
+*   uses the smallest state value possible, saving the cost of this symbol */
+ZSTD_STATIC void FSE_initCState2(FSE_CState_t *statePtr, const FSE_CTable *ct, U32 symbol)
+{
+	FSE_initCState(statePtr, ct);
+	{
+		const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform *)(statePtr->symbolTT))[symbol];
+		const U16 *stateTable = (const U16 *)(statePtr->stateTable);
+		U32 nbBitsOut = (U32)((symbolTT.deltaNbBits + (1 << 15)) >> 16);
+		statePtr->value = (nbBitsOut << 16) - symbolTT.deltaNbBits;
+		statePtr->value = stateTable[(statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+	}
+}
+
+ZSTD_STATIC void FSE_encodeSymbol(BIT_CStream_t *bitC, FSE_CState_t *statePtr, U32 symbol)
+{
+	const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform *)(statePtr->symbolTT))[symbol];
+	const U16 *const stateTable = (const U16 *)(statePtr->stateTable);
+	U32 nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16);
+	BIT_addBits(bitC, statePtr->value, nbBitsOut);
+	statePtr->value = stateTable[(statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+}
+
+ZSTD_STATIC void FSE_flushCState(BIT_CStream_t *bitC, const FSE_CState_t *statePtr)
+{
+	BIT_addBits(bitC, statePtr->value, statePtr->stateLog);
+	BIT_flushBits(bitC);
+}
+
+/* ======    Decompression    ====== */
+
+typedef struct {
+	U16 tableLog;
+	U16 fastMode;
+} FSE_DTableHeader; /* sizeof U32 */
+
+typedef struct {
+	unsigned short newState;
+	unsigned char symbol;
+	unsigned char nbBits;
+} FSE_decode_t; /* size == U32 */
+
+ZSTD_STATIC void FSE_initDState(FSE_DState_t *DStatePtr, BIT_DStream_t *bitD, const FSE_DTable *dt)
+{
+	const void *ptr = dt;
+	const FSE_DTableHeader *const DTableH = (const FSE_DTableHeader *)ptr;
+	DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog);
+	BIT_reloadDStream(bitD);
+	DStatePtr->table = dt + 1;
+}
+
+ZSTD_STATIC BYTE FSE_peekSymbol(const FSE_DState_t *DStatePtr)
+{
+	FSE_decode_t const DInfo = ((const FSE_decode_t *)(DStatePtr->table))[DStatePtr->state];
+	return DInfo.symbol;
+}
+
+ZSTD_STATIC void FSE_updateState(FSE_DState_t *DStatePtr, BIT_DStream_t *bitD)
+{
+	FSE_decode_t const DInfo = ((const FSE_decode_t *)(DStatePtr->table))[DStatePtr->state];
+	U32 const nbBits = DInfo.nbBits;
+	size_t const lowBits = BIT_readBits(bitD, nbBits);
+	DStatePtr->state = DInfo.newState + lowBits;
+}
+
+ZSTD_STATIC BYTE FSE_decodeSymbol(FSE_DState_t *DStatePtr, BIT_DStream_t *bitD)
+{
+	FSE_decode_t const DInfo = ((const FSE_decode_t *)(DStatePtr->table))[DStatePtr->state];
+	U32 const nbBits = DInfo.nbBits;
+	BYTE const symbol = DInfo.symbol;
+	size_t const lowBits = BIT_readBits(bitD, nbBits);
+
+	DStatePtr->state = DInfo.newState + lowBits;
+	return symbol;
+}
+
+/*! FSE_decodeSymbolFast() :
+	unsafe, only works if no symbol has a probability > 50% */
+ZSTD_STATIC BYTE FSE_decodeSymbolFast(FSE_DState_t *DStatePtr, BIT_DStream_t *bitD)
+{
+	FSE_decode_t const DInfo = ((const FSE_decode_t *)(DStatePtr->table))[DStatePtr->state];
+	U32 const nbBits = DInfo.nbBits;
+	BYTE const symbol = DInfo.symbol;
+	size_t const lowBits = BIT_readBitsFast(bitD, nbBits);
+
+	DStatePtr->state = DInfo.newState + lowBits;
+	return symbol;
+}
+
+ZSTD_STATIC unsigned FSE_endOfDState(const FSE_DState_t *DStatePtr) { return DStatePtr->state == 0; }
+
+/* **************************************************************
+*  Tuning parameters
+****************************************************************/
+/*!MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+#ifndef FSE_MAX_MEMORY_USAGE
+#define FSE_MAX_MEMORY_USAGE 14
+#endif
+#ifndef FSE_DEFAULT_MEMORY_USAGE
+#define FSE_DEFAULT_MEMORY_USAGE 13
+#endif
+
+/*!FSE_MAX_SYMBOL_VALUE :
+*  Maximum symbol value authorized.
+*  Required for proper stack allocation */
+#ifndef FSE_MAX_SYMBOL_VALUE
+#define FSE_MAX_SYMBOL_VALUE 255
+#endif
+
+/* **************************************************************
+*  template functions type & suffix
+****************************************************************/
+#define FSE_FUNCTION_TYPE BYTE
+#define FSE_FUNCTION_EXTENSION
+#define FSE_DECODE_TYPE FSE_decode_t
+
+/* ***************************************************************
+*  Constants
+*****************************************************************/
+#define FSE_MAX_TABLELOG (FSE_MAX_MEMORY_USAGE - 2)
+#define FSE_MAX_TABLESIZE (1U << FSE_MAX_TABLELOG)
+#define FSE_MAXTABLESIZE_MASK (FSE_MAX_TABLESIZE - 1)
+#define FSE_DEFAULT_TABLELOG (FSE_DEFAULT_MEMORY_USAGE - 2)
+#define FSE_MIN_TABLELOG 5
+
+#define FSE_TABLELOG_ABSOLUTE_MAX 15
+#if FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX
+#error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported"
+#endif
+
+#define FSE_TABLESTEP(tableSize) ((tableSize >> 1) + (tableSize >> 3) + 3)
+
+#endif /* FSE_H */
--- a/lib/zstd/fse_decompress.c
+++ b/lib/zstd/fse_decompress.c
@ -0,0 +1,302 @@
+// SPDX-License-Identifier: (GPL-2.0 or BSD-2-Clause)
+/*
+ * FSE : Finite State Entropy decoder
+ * Copyright (C) 2013-2015, Yann Collet.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ */
+
+/* **************************************************************
+*  Compiler specifics
+****************************************************************/
+#define FORCE_INLINE static __always_inline
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include "bitstream.h"
+#include "fse.h"
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/string.h> /* memcpy, memset */
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define FSE_isError ERR_isError
+#define FSE_STATIC_ASSERT(c)                                   \
+	{                                                      \
+		enum { FSE_static_assert = 1 / (int)(!!(c)) }; \
+	} /* use only *after* variable declarations */
+
+/* check and forward error code */
+#define CHECK_F(f)                  \
+	{                           \
+		size_t const e = f; \
+		if (FSE_isError(e)) \
+			return e;   \
+	}
+
+/* **************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+#error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X, Y) X##Y
+#define FSE_FUNCTION_NAME(X, Y) FSE_CAT(X, Y)
+#define FSE_TYPE_NAME(X, Y) FSE_CAT(X, Y)
+
+/* Function templates */
+
+size_t FSE_buildDTable_wksp(FSE_DTable *dt, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void *workspace, size_t workspaceSize)
+{
+	void *const tdPtr = dt + 1; /* because *dt is unsigned, 32-bits aligned on 32-bits */
+	FSE_DECODE_TYPE *const tableDecode = (FSE_DECODE_TYPE *)(tdPtr);
+	U16 *symbolNext = (U16 *)workspace;
+
+	U32 const maxSV1 = maxSymbolValue + 1;
+	U32 const tableSize = 1 << tableLog;
+	U32 highThreshold = tableSize - 1;
+
+	/* Sanity Checks */
+	if (workspaceSize < sizeof(U16) * (FSE_MAX_SYMBOL_VALUE + 1))
+		return ERROR(tableLog_tooLarge);
+	if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE)
+		return ERROR(maxSymbolValue_tooLarge);
+	if (tableLog > FSE_MAX_TABLELOG)
+		return ERROR(tableLog_tooLarge);
+
+	/* Init, lay down lowprob symbols */
+	{
+		FSE_DTableHeader DTableH;
+		DTableH.tableLog = (U16)tableLog;
+		DTableH.fastMode = 1;
+		{
+			S16 const largeLimit = (S16)(1 << (tableLog - 1));
+			U32 s;
+			for (s = 0; s < maxSV1; s++) {
+				if (normalizedCounter[s] == -1) {
+					tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s;
+					symbolNext[s] = 1;
+				} else {
+					if (normalizedCounter[s] >= largeLimit)
+						DTableH.fastMode = 0;
+					symbolNext[s] = normalizedCounter[s];
+				}
+			}
+		}
+		memcpy(dt, &DTableH, sizeof(DTableH));
+	}
+
+	/* Spread symbols */
+	{
+		U32 const tableMask = tableSize - 1;
+		U32 const step = FSE_TABLESTEP(tableSize);
+		U32 s, position = 0;
+		for (s = 0; s < maxSV1; s++) {
+			int i;
+			for (i = 0; i < normalizedCounter[s]; i++) {
+				tableDecode[position].symbol = (FSE_FUNCTION_TYPE)s;
+				position = (position + step) & tableMask;
+				while (position > highThreshold)
+					position = (position + step) & tableMask; /* lowprob area */
+			}
+		}
+		if (position != 0)
+			return ERROR(GENERIC); /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+	}
+
+	/* Build Decoding table */
+	{
+		U32 u;
+		for (u = 0; u < tableSize; u++) {
+			FSE_FUNCTION_TYPE const symbol = (FSE_FUNCTION_TYPE)(tableDecode[u].symbol);
+			U16 nextState = symbolNext[symbol]++;
+			tableDecode[u].nbBits = (BYTE)(tableLog - BIT_highbit32((U32)nextState));
+			tableDecode[u].newState = (U16)((nextState << tableDecode[u].nbBits) - tableSize);
+		}
+	}
+
+	return 0;
+}
+
+/*-*******************************************************
+*  Decompression (Byte symbols)
+*********************************************************/
+size_t FSE_buildDTable_rle(FSE_DTable *dt, BYTE symbolValue)
+{
+	void *ptr = dt;
+	FSE_DTableHeader *const DTableH = (FSE_DTableHeader *)ptr;
+	void *dPtr = dt + 1;
+	FSE_decode_t *const cell = (FSE_decode_t *)dPtr;
+
+	DTableH->tableLog = 0;
+	DTableH->fastMode = 0;
+
+	cell->newState = 0;
+	cell->symbol = symbolValue;
+	cell->nbBits = 0;
+
+	return 0;
+}
+
+size_t FSE_buildDTable_raw(FSE_DTable *dt, unsigned nbBits)
+{
+	void *ptr = dt;
+	FSE_DTableHeader *const DTableH = (FSE_DTableHeader *)ptr;
+	void *dPtr = dt + 1;
+	FSE_decode_t *const dinfo = (FSE_decode_t *)dPtr;
+	const unsigned tableSize = 1 << nbBits;
+	const unsigned tableMask = tableSize - 1;
+	const unsigned maxSV1 = tableMask + 1;
+	unsigned s;
+
+	/* Sanity checks */
+	if (nbBits < 1)
+		return ERROR(GENERIC); /* min size */
+
+	/* Build Decoding Table */
+	DTableH->tableLog = (U16)nbBits;
+	DTableH->fastMode = 1;
+	for (s = 0; s < maxSV1; s++) {
+		dinfo[s].newState = 0;
+		dinfo[s].symbol = (BYTE)s;
+		dinfo[s].nbBits = (BYTE)nbBits;
+	}
+
+	return 0;
+}
+
+FORCE_INLINE size_t FSE_decompress_usingDTable_generic(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const FSE_DTable *dt,
+						       const unsigned fast)
+{
+	BYTE *const ostart = (BYTE *)dst;
+	BYTE *op = ostart;
+	BYTE *const omax = op + maxDstSize;
+	BYTE *const olimit = omax - 3;
+
+	BIT_DStream_t bitD;
+	FSE_DState_t state1;
+	FSE_DState_t state2;
+
+	/* Init */
+	CHECK_F(BIT_initDStream(&bitD, cSrc, cSrcSize));
+
+	FSE_initDState(&state1, &bitD, dt);
+	FSE_initDState(&state2, &bitD, dt);
+
+#define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD)
+
+	/* 4 symbols per loop */
+	for (; (BIT_reloadDStream(&bitD) == BIT_DStream_unfinished) & (op < olimit); op += 4) {
+		op[0] = FSE_GETSYMBOL(&state1);
+
+		if (FSE_MAX_TABLELOG * 2 + 7 > sizeof(bitD.bitContainer) * 8) /* This test must be static */
+			BIT_reloadDStream(&bitD);
+
+		op[1] = FSE_GETSYMBOL(&state2);
+
+		if (FSE_MAX_TABLELOG * 4 + 7 > sizeof(bitD.bitContainer) * 8) /* This test must be static */
+		{
+			if (BIT_reloadDStream(&bitD) > BIT_DStream_unfinished) {
+				op += 2;
+				break;
+			}
+		}
+
+		op[2] = FSE_GETSYMBOL(&state1);
+
+		if (FSE_MAX_TABLELOG * 2 + 7 > sizeof(bitD.bitContainer) * 8) /* This test must be static */
+			BIT_reloadDStream(&bitD);
+
+		op[3] = FSE_GETSYMBOL(&state2);
+	}
+
+	/* tail */
+	/* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */
+	while (1) {
+		if (op > (omax - 2))
+			return ERROR(dstSize_tooSmall);
+		*op++ = FSE_GETSYMBOL(&state1);
+		if (BIT_reloadDStream(&bitD) == BIT_DStream_overflow) {
+			*op++ = FSE_GETSYMBOL(&state2);
+			break;
+		}
+
+		if (op > (omax - 2))
+			return ERROR(dstSize_tooSmall);
+		*op++ = FSE_GETSYMBOL(&state2);
+		if (BIT_reloadDStream(&bitD) == BIT_DStream_overflow) {
+			*op++ = FSE_GETSYMBOL(&state1);
+			break;
+		}
+	}
+
+	return op - ostart;
+}
+
+size_t FSE_decompress_usingDTable(void *dst, size_t originalSize, const void *cSrc, size_t cSrcSize, const FSE_DTable *dt)
+{
+	const void *ptr = dt;
+	const FSE_DTableHeader *DTableH = (const FSE_DTableHeader *)ptr;
+	const U32 fastMode = DTableH->fastMode;
+
+	/* select fast mode (static) */
+	if (fastMode)
+		return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+	return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+}
+
+size_t FSE_decompress_wksp(void *dst, size_t dstCapacity, const void *cSrc, size_t cSrcSize, unsigned maxLog, void *workspace, size_t workspaceSize)
+{
+	const BYTE *const istart = (const BYTE *)cSrc;
+	const BYTE *ip = istart;
+	unsigned tableLog;
+	unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+	size_t NCountLength;
+
+	FSE_DTable *dt;
+	short *counting;
+	size_t spaceUsed32 = 0;
+
+	FSE_STATIC_ASSERT(sizeof(FSE_DTable) == sizeof(U32));
+
+	dt = (FSE_DTable *)((U32 *)workspace + spaceUsed32);
+	spaceUsed32 += FSE_DTABLE_SIZE_U32(maxLog);
+	counting = (short *)((U32 *)workspace + spaceUsed32);
+	spaceUsed32 += ALIGN(sizeof(short) * (FSE_MAX_SYMBOL_VALUE + 1), sizeof(U32)) >> 2;
+
+	if ((spaceUsed32 << 2) > workspaceSize)
+		return ERROR(tableLog_tooLarge);
+	workspace = (U32 *)workspace + spaceUsed32;
+	workspaceSize -= (spaceUsed32 << 2);
+
+	/* normal FSE decoding mode */
+	NCountLength = FSE_readNCount(counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
+	if (FSE_isError(NCountLength))
+		return NCountLength;
+	// if (NCountLength >= cSrcSize) return ERROR(srcSize_wrong);   /* too small input size; supposed to be already checked in NCountLength, only remaining
+	// case : NCountLength==cSrcSize */
+	if (tableLog > maxLog)
+		return ERROR(tableLog_tooLarge);
+	ip += NCountLength;
+	cSrcSize -= NCountLength;
+
+	CHECK_F(FSE_buildDTable_wksp(dt, counting, maxSymbolValue, tableLog, workspace, workspaceSize));
+
+	return FSE_decompress_usingDTable(dst, dstCapacity, ip, cSrcSize, dt); /* always return, even if it is an error code */
+}
--- a/lib/zstd/huf.h
+++ b/lib/zstd/huf.h
@ -0,0 +1,182 @@
+/* SPDX-License-Identifier: (GPL-2.0 or BSD-2-Clause) */
+/*
+ * Huffman coder, part of New Generation Entropy library
+ * header file
+ * Copyright (C) 2013-2016, Yann Collet.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ */
+#ifndef HUF_H_298734234
+#define HUF_H_298734234
+
+/* *** Dependencies *** */
+#include <linux/types.h> /* size_t */
+
+/* ***   Tool functions *** */
+#define HUF_BLOCKSIZE_MAX (128 * 1024) /**< maximum input size for a single block compressed with HUF_compress */
+size_t HUF_compressBound(size_t size); /**< maximum compressed size (worst case) */
+
+/* Error Management */
+unsigned HUF_isError(size_t code); /**< tells if a return value is an error code */
+
+/* ***   Advanced function   *** */
+
+/** HUF_compress4X_wksp() :
+*   Same as HUF_compress2(), but uses externally allocated `workSpace`, which must be a table of >= 1024 unsigned */
+size_t HUF_compress4X_wksp(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
+			   size_t wkspSize); /**< `workSpace` must be a table of at least HUF_COMPRESS_WORKSPACE_SIZE_U32 unsigned */
+
+/* *** Dependencies *** */
+#include "mem.h" /* U32 */
+
+/* *** Constants *** */
+#define HUF_TABLELOG_MAX 12     /* max configured tableLog (for static allocation); can be modified up to HUF_ABSOLUTEMAX_TABLELOG */
+#define HUF_TABLELOG_DEFAULT 11 /* tableLog by default, when not specified */
+#define HUF_SYMBOLVALUE_MAX 255
+
+#define HUF_TABLELOG_ABSOLUTEMAX 15 /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */
+#if (HUF_TABLELOG_MAX > HUF_TABLELOG_ABSOLUTEMAX)
+#error "HUF_TABLELOG_MAX is too large !"
+#endif
+
+/* ****************************************
+*  Static allocation
+******************************************/
+/* HUF buffer bounds */
+#define HUF_CTABLEBOUND 129
+#define HUF_BLOCKBOUND(size) (size + (size >> 8) + 8)			 /* only true if incompressible pre-filtered with fast heuristic */
+#define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size)) /* Macro version, useful for static allocation */
+
+/* static allocation of HUF's Compression Table */
+#define HUF_CREATE_STATIC_CTABLE(name, maxSymbolValue) \
+	U32 name##hb[maxSymbolValue + 1];              \
+	void *name##hv = &(name##hb);                  \
+	HUF_CElt *name = (HUF_CElt *)(name##hv) /* no final ; */
+
+/* static allocation of HUF's DTable */
+typedef U32 HUF_DTable;
+#define HUF_DTABLE_SIZE(maxTableLog) (1 + (1 << (maxTableLog)))
+#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) HUF_DTable DTable[HUF_DTABLE_SIZE((maxTableLog)-1)] = {((U32)((maxTableLog)-1) * 0x01000001)}
+#define HUF_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) HUF_DTable DTable[HUF_DTABLE_SIZE(maxTableLog)] = {((U32)(maxTableLog)*0x01000001)}
+
+/* The workspace must have alignment at least 4 and be at least this large */
+#define HUF_COMPRESS_WORKSPACE_SIZE (6 << 10)
+#define HUF_COMPRESS_WORKSPACE_SIZE_U32 (HUF_COMPRESS_WORKSPACE_SIZE / sizeof(U32))
+
+/* The workspace must have alignment at least 4 and be at least this large */
+#define HUF_DECOMPRESS_WORKSPACE_SIZE (3 << 10)
+#define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32))
+
+/* ****************************************
+*  Advanced decompression functions
+******************************************/
+size_t HUF_decompress4X_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize); /**< decodes RLE and uncompressed */
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
+				size_t workspaceSize);							       /**< considers RLE and uncompressed as errors */
+size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
+				   size_t workspaceSize); /**< single-symbol decoder */
+size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
+				   size_t workspaceSize); /**< double-symbols decoder */
+
+/* ****************************************
+*  HUF detailed API
+******************************************/
+/*!
+HUF_compress() does the following:
+1. count symbol occurrence from source[] into table count[] using FSE_count()
+2. (optional) refine tableLog using HUF_optimalTableLog()
+3. build Huffman table from count using HUF_buildCTable()
+4. save Huffman table to memory buffer using HUF_writeCTable_wksp()
+5. encode the data stream using HUF_compress4X_usingCTable()
+
+The following API allows targeting specific sub-functions for advanced tasks.
+For example, it's possible to compress several blocks using the same 'CTable',
+or to save and regenerate 'CTable' using external methods.
+*/
+/* FSE_count() : find it within "fse.h" */
+unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
+typedef struct HUF_CElt_s HUF_CElt; /* incomplete type */
+size_t HUF_writeCTable_wksp(void *dst, size_t maxDstSize, const HUF_CElt *CTable, unsigned maxSymbolValue, unsigned huffLog, void *workspace, size_t workspaceSize);
+size_t HUF_compress4X_usingCTable(void *dst, size_t dstSize, const void *src, size_t srcSize, const HUF_CElt *CTable);
+
+typedef enum {
+	HUF_repeat_none,  /**< Cannot use the previous table */
+	HUF_repeat_check, /**< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1,
+			     4}X_repeat */
+	HUF_repeat_valid  /**< Can use the previous table and it is asumed to be valid */
+} HUF_repeat;
+/** HUF_compress4X_repeat() :
+*   Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
+*   If it uses hufTable it does not modify hufTable or repeat.
+*   If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
+*   If preferRepeat then the old table will always be used if valid. */
+size_t HUF_compress4X_repeat(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
+			     size_t wkspSize, HUF_CElt *hufTable, HUF_repeat *repeat,
+			     int preferRepeat); /**< `workSpace` must be a table of at least HUF_COMPRESS_WORKSPACE_SIZE_U32 unsigned */
+
+/** HUF_buildCTable_wksp() :
+ *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
+ *  `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as a table of 1024 unsigned.
+ */
+size_t HUF_buildCTable_wksp(HUF_CElt *tree, const U32 *count, U32 maxSymbolValue, U32 maxNbBits, void *workSpace, size_t wkspSize);
+
+/*! HUF_readStats() :
+	Read compact Huffman tree, saved by HUF_writeCTable().
+	`huffWeight` is destination buffer.
+	@return : size read from `src` , or an error Code .
+	Note : Needed by HUF_readCTable() and HUF_readDTableXn() . */
+size_t HUF_readStats_wksp(BYTE *huffWeight, size_t hwSize, U32 *rankStats, U32 *nbSymbolsPtr, U32 *tableLogPtr, const void *src, size_t srcSize,
+			  void *workspace, size_t workspaceSize);
+
+/** HUF_readCTable() :
+*   Loading a CTable saved with HUF_writeCTable() */
+size_t HUF_readCTable_wksp(HUF_CElt *CTable, unsigned maxSymbolValue, const void *src, size_t srcSize, void *workspace, size_t workspaceSize);
+
+/*
+HUF_decompress() does the following:
+1. select the decompression algorithm (X2, X4) based on pre-computed heuristics
+2. build Huffman table from save, using HUF_readDTableXn()
+3. decode 1 or 4 segments in parallel using HUF_decompressSXn_usingDTable
+*/
+
+/** HUF_selectDecoder() :
+*   Tells which decoder is likely to decode faster,
+*   based on a set of pre-determined metrics.
+*   @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 .
+*   Assumption : 0 < cSrcSize < dstSize <= 128 KB */
+U32 HUF_selectDecoder(size_t dstSize, size_t cSrcSize);
+
+size_t HUF_readDTableX2_wksp(HUF_DTable *DTable, const void *src, size_t srcSize, void *workspace, size_t workspaceSize);
+size_t HUF_readDTableX4_wksp(HUF_DTable *DTable, const void *src, size_t srcSize, void *workspace, size_t workspaceSize);
+
+size_t HUF_decompress4X_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable);
+size_t HUF_decompress4X2_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable);
+size_t HUF_decompress4X4_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable);
+
+/* single stream variants */
+
+size_t HUF_compress1X_wksp(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
+			   size_t wkspSize); /**< `workSpace` must be a table of at least HUF_COMPRESS_WORKSPACE_SIZE_U32 unsigned */
+size_t HUF_compress1X_usingCTable(void *dst, size_t dstSize, const void *src, size_t srcSize, const HUF_CElt *CTable);
+/** HUF_compress1X_repeat() :
+*   Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
+*   If it uses hufTable it does not modify hufTable or repeat.
+*   If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
+*   If preferRepeat then the old table will always be used if valid. */
+size_t HUF_compress1X_repeat(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
+			     size_t wkspSize, HUF_CElt *hufTable, HUF_repeat *repeat,
+			     int preferRepeat); /**< `workSpace` must be a table of at least HUF_COMPRESS_WORKSPACE_SIZE_U32 unsigned */
+
+size_t HUF_decompress1X_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize);
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
+				   size_t workspaceSize); /**< single-symbol decoder */
+size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
+				   size_t workspaceSize); /**< double-symbols decoder */
+
+size_t HUF_decompress1X_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize,
+				    const HUF_DTable *DTable); /**< automatic selection of sing or double symbol decoder, based on DTable */
+size_t HUF_decompress1X2_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable);
+size_t HUF_decompress1X4_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable);
+
+#endif /* HUF_H_298734234 */
--- a/lib/zstd/huf_decompress.c
+++ b/lib/zstd/huf_decompress.c
@ -0,0 +1,930 @@
+// SPDX-License-Identifier: (GPL-2.0 or BSD-2-Clause)
+/*
+ * Huffman decoder, part of New Generation Entropy library
+ * Copyright (C) 2013-2016, Yann Collet.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ */
+
+/* **************************************************************
+*  Compiler specifics
+****************************************************************/
+#define FORCE_INLINE static __always_inline
+
+/* **************************************************************
+*  Dependencies
+****************************************************************/
+#include "bitstream.h" /* BIT_* */
+#include "fse.h"       /* header compression */
+#include "huf.h"
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/string.h> /* memcpy, memset */
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define HUF_STATIC_ASSERT(c)                                   \
+	{                                                      \
+		enum { HUF_static_assert = 1 / (int)(!!(c)) }; \
+	} /* use only *after* variable declarations */
+
+/*-***************************/
+/*  generic DTableDesc       */
+/*-***************************/
+
+typedef struct {
+	BYTE maxTableLog;
+	BYTE tableType;
+	BYTE tableLog;
+	BYTE reserved;
+} DTableDesc;
+
+static DTableDesc HUF_getDTableDesc(const HUF_DTable *table)
+{
+	DTableDesc dtd;
+	memcpy(&dtd, table, sizeof(dtd));
+	return dtd;
+}
+
+/*-***************************/
+/*  single-symbol decoding   */
+/*-***************************/
+
+typedef struct {
+	BYTE byte;
+	BYTE nbBits;
+} HUF_DEltX2; /* single-symbol decoding */
+
+size_t HUF_readDTableX2_wksp(HUF_DTable *DTable, const void *src, size_t srcSize, void *workspace, size_t workspaceSize)
+{
+	U32 tableLog = 0;
+	U32 nbSymbols = 0;
+	size_t iSize;
+	void *const dtPtr = DTable + 1;
+	HUF_DEltX2 *const dt = (HUF_DEltX2 *)dtPtr;
+
+	U32 *rankVal;
+	BYTE *huffWeight;
+	size_t spaceUsed32 = 0;
+
+	rankVal = (U32 *)workspace + spaceUsed32;
+	spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1;
+	huffWeight = (BYTE *)((U32 *)workspace + spaceUsed32);
+	spaceUsed32 += ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
+
+	if ((spaceUsed32 << 2) > workspaceSize)
+		return ERROR(tableLog_tooLarge);
+	workspace = (U32 *)workspace + spaceUsed32;
+	workspaceSize -= (spaceUsed32 << 2);
+
+	HUF_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
+	/* memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */
+
+	iSize = HUF_readStats_wksp(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize, workspace, workspaceSize);
+	if (HUF_isError(iSize))
+		return iSize;
+
+	/* Table header */
+	{
+		DTableDesc dtd = HUF_getDTableDesc(DTable);
+		if (tableLog > (U32)(dtd.maxTableLog + 1))
+			return ERROR(tableLog_tooLarge); /* DTable too small, Huffman tree cannot fit in */
+		dtd.tableType = 0;
+		dtd.tableLog = (BYTE)tableLog;
+		memcpy(DTable, &dtd, sizeof(dtd));
+	}
+
+	/* Calculate starting value for each rank */
+	{
+		U32 n, nextRankStart = 0;
+		for (n = 1; n < tableLog + 1; n++) {
+			U32 const curr = nextRankStart;
+			nextRankStart += (rankVal[n] << (n - 1));
+			rankVal[n] = curr;
+		}
+	}
+
+	/* fill DTable */
+	{
+		U32 n;
+		for (n = 0; n < nbSymbols; n++) {
+			U32 const w = huffWeight[n];
+			U32 const length = (1 << w) >> 1;
+			U32 u;
+			HUF_DEltX2 D;
+			D.byte = (BYTE)n;
+			D.nbBits = (BYTE)(tableLog + 1 - w);
+			for (u = rankVal[w]; u < rankVal[w] + length; u++)
+				dt[u] = D;
+			rankVal[w] += length;
+		}
+	}
+
+	return iSize;
+}
+
+static BYTE HUF_decodeSymbolX2(BIT_DStream_t *Dstream, const HUF_DEltX2 *dt, const U32 dtLog)
+{
+	size_t const val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
+	BYTE const c = dt[val].byte;
+	BIT_skipBits(Dstream, dt[val].nbBits);
+	return c;
+}
+
+#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) *ptr++ = HUF_decodeSymbolX2(DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr)         \
+	if (ZSTD_64bits() || (HUF_TABLELOG_MAX <= 12)) \
+	HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+
+#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
+	if (ZSTD_64bits())                     \
+	HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+
+FORCE_INLINE size_t HUF_decodeStreamX2(BYTE *p, BIT_DStream_t *const bitDPtr, BYTE *const pEnd, const HUF_DEltX2 *const dt, const U32 dtLog)
+{
+	BYTE *const pStart = p;
+
+	/* up to 4 symbols at a time */
+	while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd - 4)) {
+		HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+		HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
+		HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+		HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+	}
+
+	/* closer to the end */
+	while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p < pEnd))
+		HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+	/* no more data to retrieve from bitstream, hence no need to reload */
+	while (p < pEnd)
+		HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+	return pEnd - pStart;
+}
+
+static size_t HUF_decompress1X2_usingDTable_internal(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable)
+{
+	BYTE *op = (BYTE *)dst;
+	BYTE *const oend = op + dstSize;
+	const void *dtPtr = DTable + 1;
+	const HUF_DEltX2 *const dt = (const HUF_DEltX2 *)dtPtr;
+	BIT_DStream_t bitD;
+	DTableDesc const dtd = HUF_getDTableDesc(DTable);
+	U32 const dtLog = dtd.tableLog;
+
+	{
+		size_t const errorCode = BIT_initDStream(&bitD, cSrc, cSrcSize);
+		if (HUF_isError(errorCode))
+			return errorCode;
+	}
+
+	HUF_decodeStreamX2(op, &bitD, oend, dt, dtLog);
+
+	/* check */
+	if (!BIT_endOfDStream(&bitD))
+		return ERROR(corruption_detected);
+
+	return dstSize;
+}
+
+size_t HUF_decompress1X2_usingDTable(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable)
+{
+	DTableDesc dtd = HUF_getDTableDesc(DTable);
+	if (dtd.tableType != 0)
+		return ERROR(GENERIC);
+	return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
+}
+
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable *DCtx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
+{
+	const BYTE *ip = (const BYTE *)cSrc;
+
+	size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, workspace, workspaceSize);
+	if (HUF_isError(hSize))
+		return hSize;
+	if (hSize >= cSrcSize)
+		return ERROR(srcSize_wrong);
+	ip += hSize;
+	cSrcSize -= hSize;
+
+	return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx);
+}
+
+static size_t HUF_decompress4X2_usingDTable_internal(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable)
+{
+	/* Check */
+	if (cSrcSize < 10)
+		return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */
+
+	{
+		const BYTE *const istart = (const BYTE *)cSrc;
+		BYTE *const ostart = (BYTE *)dst;
+		BYTE *const oend = ostart + dstSize;
+		const void *const dtPtr = DTable + 1;
+		const HUF_DEltX2 *const dt = (const HUF_DEltX2 *)dtPtr;
+
+		/* Init */
+		BIT_DStream_t bitD1;
+		BIT_DStream_t bitD2;
+		BIT_DStream_t bitD3;
+		BIT_DStream_t bitD4;
+		size_t const length1 = ZSTD_readLE16(istart);
+		size_t const length2 = ZSTD_readLE16(istart + 2);
+		size_t const length3 = ZSTD_readLE16(istart + 4);
+		size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
+		const BYTE *const istart1 = istart + 6; /* jumpTable */
+		const BYTE *const istart2 = istart1 + length1;
+		const BYTE *const istart3 = istart2 + length2;
+		const BYTE *const istart4 = istart3 + length3;
+		const size_t segmentSize = (dstSize + 3) / 4;
+		BYTE *const opStart2 = ostart + segmentSize;
+		BYTE *const opStart3 = opStart2 + segmentSize;
+		BYTE *const opStart4 = opStart3 + segmentSize;
+		BYTE *op1 = ostart;
+		BYTE *op2 = opStart2;
+		BYTE *op3 = opStart3;
+		BYTE *op4 = opStart4;
+		U32 endSignal;
+		DTableDesc const dtd = HUF_getDTableDesc(DTable);
+		U32 const dtLog = dtd.tableLog;
+
+		if (length4 > cSrcSize)
+			return ERROR(corruption_detected); /* overflow */
+		{
+			size_t const errorCode = BIT_initDStream(&bitD1, istart1, length1);
+			if (HUF_isError(errorCode))
+				return errorCode;
+		}
+		{
+			size_t const errorCode = BIT_initDStream(&bitD2, istart2, length2);
+			if (HUF_isError(errorCode))
+				return errorCode;
+		}
+		{
+			size_t const errorCode = BIT_initDStream(&bitD3, istart3, length3);
+			if (HUF_isError(errorCode))
+				return errorCode;
+		}
+		{
+			size_t const errorCode = BIT_initDStream(&bitD4, istart4, length4);
+			if (HUF_isError(errorCode))
+				return errorCode;
+		}
+
+		/* 16-32 symbols per loop (4-8 symbols per stream) */
+		endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+		for (; (endSignal == BIT_DStream_unfinished) && (op4 < (oend - 7));) {
+			HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+			HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+			HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+			HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+			HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
+			HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
+			HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
+			HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
+			HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+			HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+			HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+			HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+			HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
+			HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
+			HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
+			HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
+			endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+		}
+
+		/* check corruption */
+		if (op1 > opStart2)
+			return ERROR(corruption_detected);
+		if (op2 > opStart3)
+			return ERROR(corruption_detected);
+		if (op3 > opStart4)
+			return ERROR(corruption_detected);
+		/* note : op4 supposed already verified within main loop */
+
+		/* finish bitStreams one by one */
+		HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
+		HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
+		HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
+		HUF_decodeStreamX2(op4, &bitD4, oend, dt, dtLog);
+
+		/* check */
+		endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+		if (!endSignal)
+			return ERROR(corruption_detected);
+
+		/* decoded size */
+		return dstSize;
+	}
+}
+
+size_t HUF_decompress4X2_usingDTable(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable)
+{
+	DTableDesc dtd = HUF_getDTableDesc(DTable);
+	if (dtd.tableType != 0)
+		return ERROR(GENERIC);
+	return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
+}
+
+size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
+{
+	const BYTE *ip = (const BYTE *)cSrc;
+
+	size_t const hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, workspace, workspaceSize);
+	if (HUF_isError(hSize))
+		return hSize;
+	if (hSize >= cSrcSize)
+		return ERROR(srcSize_wrong);
+	ip += hSize;
+	cSrcSize -= hSize;
+
+	return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx);
+}
+
+/* *************************/
+/* double-symbols decoding */
+/* *************************/
+typedef struct {
+	U16 sequence;
+	BYTE nbBits;
+	BYTE length;
+} HUF_DEltX4; /* double-symbols decoding */
+
+typedef struct {
+	BYTE symbol;
+	BYTE weight;
+} sortedSymbol_t;
+
+/* HUF_fillDTableX4Level2() :
+ * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */
+static void HUF_fillDTableX4Level2(HUF_DEltX4 *DTable, U32 sizeLog, const U32 consumed, const U32 *rankValOrigin, const int minWeight,
+				   const sortedSymbol_t *sortedSymbols, const U32 sortedListSize, U32 nbBitsBaseline, U16 baseSeq)
+{
+	HUF_DEltX4 DElt;
+	U32 rankVal[HUF_TABLELOG_MAX + 1];
+
+	/* get pre-calculated rankVal */
+	memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+	/* fill skipped values */
+	if (minWeight > 1) {
+		U32 i, skipSize = rankVal[minWeight];
+		ZSTD_writeLE16(&(DElt.sequence), baseSeq);
+		DElt.nbBits = (BYTE)(consumed);
+		DElt.length = 1;
+		for (i = 0; i < skipSize; i++)
+			DTable[i] = DElt;
+	}
+
+	/* fill DTable */
+	{
+		U32 s;
+		for (s = 0; s < sortedListSize; s++) { /* note : sortedSymbols already skipped */
+			const U32 symbol = sortedSymbols[s].symbol;
+			const U32 weight = sortedSymbols[s].weight;
+			const U32 nbBits = nbBitsBaseline - weight;
+			const U32 length = 1 << (sizeLog - nbBits);
+			const U32 start = rankVal[weight];
+			U32 i = start;
+			const U32 end = start + length;
+
+			ZSTD_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));
+			DElt.nbBits = (BYTE)(nbBits + consumed);
+			DElt.length = 2;
+			do {
+				DTable[i++] = DElt;
+			} while (i < end); /* since length >= 1 */
+
+			rankVal[weight] += length;
+		}
+	}
+}
+
+typedef U32 rankVal_t[HUF_TABLELOG_MAX][HUF_TABLELOG_MAX + 1];
+typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1];
+
+static void HUF_fillDTableX4(HUF_DEltX4 *DTable, const U32 targetLog, const sortedSymbol_t *sortedList, const U32 sortedListSize, const U32 *rankStart,
+			     rankVal_t rankValOrigin, const U32 maxWeight, const U32 nbBitsBaseline)
+{
+	U32 rankVal[HUF_TABLELOG_MAX + 1];
+	const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */
+	const U32 minBits = nbBitsBaseline - maxWeight;
+	U32 s;
+
+	memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+	/* fill DTable */
+	for (s = 0; s < sortedListSize; s++) {
+		const U16 symbol = sortedList[s].symbol;
+		const U32 weight = sortedList[s].weight;
+		const U32 nbBits = nbBitsBaseline - weight;
+		const U32 start = rankVal[weight];
+		const U32 length = 1 << (targetLog - nbBits);
+
+		if (targetLog - nbBits >= minBits) { /* enough room for a second symbol */
+			U32 sortedRank;
+			int minWeight = nbBits + scaleLog;
+			if (minWeight < 1)
+				minWeight = 1;
+			sortedRank = rankStart[minWeight];
+			HUF_fillDTableX4Level2(DTable + start, targetLog - nbBits, nbBits, rankValOrigin[nbBits], minWeight, sortedList + sortedRank,
+					       sortedListSize - sortedRank, nbBitsBaseline, symbol);
+		} else {
+			HUF_DEltX4 DElt;
+			ZSTD_writeLE16(&(DElt.sequence), symbol);
+			DElt.nbBits = (BYTE)(nbBits);
+			DElt.length = 1;
+			{
+				U32 const end = start + length;
+				U32 u;
+				for (u = start; u < end; u++)
+					DTable[u] = DElt;
+			}
+		}
+		rankVal[weight] += length;
+	}
+}
+
+size_t HUF_readDTableX4_wksp(HUF_DTable *DTable, const void *src, size_t srcSize, void *workspace, size_t workspaceSize)
+{
+	U32 tableLog, maxW, sizeOfSort, nbSymbols;
+	DTableDesc dtd = HUF_getDTableDesc(DTable);
+	U32 const maxTableLog = dtd.maxTableLog;
+	size_t iSize;
+	void *dtPtr = DTable + 1; /* force compiler to avoid strict-aliasing */
+	HUF_DEltX4 *const dt = (HUF_DEltX4 *)dtPtr;
+	U32 *rankStart;
+
+	rankValCol_t *rankVal;
+	U32 *rankStats;
+	U32 *rankStart0;
+	sortedSymbol_t *sortedSymbol;
+	BYTE *weightList;
+	size_t spaceUsed32 = 0;
+
+	HUF_STATIC_ASSERT((sizeof(rankValCol_t) & 3) == 0);
+
+	rankVal = (rankValCol_t *)((U32 *)workspace + spaceUsed32);
+	spaceUsed32 += (sizeof(rankValCol_t) * HUF_TABLELOG_MAX) >> 2;
+	rankStats = (U32 *)workspace + spaceUsed32;
+	spaceUsed32 += HUF_TABLELOG_MAX + 1;
+	rankStart0 = (U32 *)workspace + spaceUsed32;
+	spaceUsed32 += HUF_TABLELOG_MAX + 2;
+	sortedSymbol = (sortedSymbol_t *)((U32 *)workspace + spaceUsed32);
+	spaceUsed32 += ALIGN(sizeof(sortedSymbol_t) * (HUF_SYMBOLVALUE_MAX + 1), sizeof(U32)) >> 2;
+	weightList = (BYTE *)((U32 *)workspace + spaceUsed32);
+	spaceUsed32 += ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
+
+	if ((spaceUsed32 << 2) > workspaceSize)
+		return ERROR(tableLog_tooLarge);
+	workspace = (U32 *)workspace + spaceUsed32;
+	workspaceSize -= (spaceUsed32 << 2);
+
+	rankStart = rankStart0 + 1;
+	memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1));
+
+	HUF_STATIC_ASSERT(sizeof(HUF_DEltX4) == sizeof(HUF_DTable)); /* if compiler fails here, assertion is wrong */
+	if (maxTableLog > HUF_TABLELOG_MAX)
+		return ERROR(tableLog_tooLarge);
+	/* memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */
+
+	iSize = HUF_readStats_wksp(weightList, HUF_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize, workspace, workspaceSize);
+	if (HUF_isError(iSize))
+		return iSize;
+
+	/* check result */
+	if (tableLog > maxTableLog)
+		return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */
+
+	/* find maxWeight */
+	for (maxW = tableLog; rankStats[maxW] == 0; maxW--) {
+	} /* necessarily finds a solution before 0 */
+
+	/* Get start index of each weight */
+	{
+		U32 w, nextRankStart = 0;
+		for (w = 1; w < maxW + 1; w++) {
+			U32 curr = nextRankStart;
+			nextRankStart += rankStats[w];
+			rankStart[w] = curr;
+		}
+		rankStart[0] = nextRankStart; /* put all 0w symbols at the end of sorted list*/
+		sizeOfSort = nextRankStart;
+	}
+
+	/* sort symbols by weight */
+	{
+		U32 s;
+		for (s = 0; s < nbSymbols; s++) {
+			U32 const w = weightList[s];
+			U32 const r = rankStart[w]++;
+			sortedSymbol[r].symbol = (BYTE)s;
+			sortedSymbol[r].weight = (BYTE)w;
+		}
+		rankStart[0] = 0; /* forget 0w symbols; this is beginning of weight(1) */
+	}
+
+	/* Build rankVal */
+	{
+		U32 *const rankVal0 = rankVal[0];
+		{
+			int const rescale = (maxTableLog - tableLog) - 1; /* tableLog <= maxTableLog */
+			U32 nextRankVal = 0;
+			U32 w;
+			for (w = 1; w < maxW + 1; w++) {
+				U32 curr = nextRankVal;
+				nextRankVal += rankStats[w] << (w + rescale);
+				rankVal0[w] = curr;
+			}
+		}
+		{
+			U32 const minBits = tableLog + 1 - maxW;
+			U32 consumed;
+			for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) {
+				U32 *const rankValPtr = rankVal[consumed];
+				U32 w;
+				for (w = 1; w < maxW + 1; w++) {
+					rankValPtr[w] = rankVal0[w] >> consumed;
+				}
+			}
+		}
+	}
+
+	HUF_fillDTableX4(dt, maxTableLog, sortedSymbol, sizeOfSort, rankStart0, rankVal, maxW, tableLog + 1);
+
+	dtd.tableLog = (BYTE)maxTableLog;
+	dtd.tableType = 1;
+	memcpy(DTable, &dtd, sizeof(dtd));
+	return iSize;
+}
+
+static U32 HUF_decodeSymbolX4(void *op, BIT_DStream_t *DStream, const HUF_DEltX4 *dt, const U32 dtLog)
+{
+	size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
+	memcpy(op, dt + val, 2);
+	BIT_skipBits(DStream, dt[val].nbBits);
+	return dt[val].length;
+}
+
+static U32 HUF_decodeLastSymbolX4(void *op, BIT_DStream_t *DStream, const HUF_DEltX4 *dt, const U32 dtLog)
+{
+	size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
+	memcpy(op, dt + val, 1);
+	if (dt[val].length == 1)
+		BIT_skipBits(DStream, dt[val].nbBits);
+	else {
+		if (DStream->bitsConsumed < (sizeof(DStream->bitContainer) * 8)) {
+			BIT_skipBits(DStream, dt[val].nbBits);
+			if (DStream->bitsConsumed > (sizeof(DStream->bitContainer) * 8))
+				/* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
+				DStream->bitsConsumed = (sizeof(DStream->bitContainer) * 8);
+		}
+	}
+	return 1;
+}
+
+#define HUF_DECODE_SYMBOLX4_0(ptr, DStreamPtr) ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX4_1(ptr, DStreamPtr)         \
+	if (ZSTD_64bits() || (HUF_TABLELOG_MAX <= 12)) \
+	ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \
+	if (ZSTD_64bits())                     \
+	ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+FORCE_INLINE size_t HUF_decodeStreamX4(BYTE *p, BIT_DStream_t *bitDPtr, BYTE *const pEnd, const HUF_DEltX4 *const dt, const U32 dtLog)
+{
+	BYTE *const pStart = p;
+
+	/* up to 8 symbols at a time */
+	while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd - (sizeof(bitDPtr->bitContainer) - 1))) {
+		HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
+		HUF_DECODE_SYMBOLX4_1(p, bitDPtr);
+		HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
+		HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
+	}
+
+	/* closer to end : up to 2 symbols at a time */
+	while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd - 2))
+		HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
+
+	while (p <= pEnd - 2)
+		HUF_DECODE_SYMBOLX4_0(p, bitDPtr); /* no need to reload : reached the end of DStream */
+
+	if (p < pEnd)
+		p += HUF_decodeLastSymbolX4(p, bitDPtr, dt, dtLog);
+
+	return p - pStart;
+}
+
+static size_t HUF_decompress1X4_usingDTable_internal(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable)
+{
+	BIT_DStream_t bitD;
+
+	/* Init */
+	{
+		size_t const errorCode = BIT_initDStream(&bitD, cSrc, cSrcSize);
+		if (HUF_isError(errorCode))
+			return errorCode;
+	}
+
+	/* decode */
+	{
+		BYTE *const ostart = (BYTE *)dst;
+		BYTE *const oend = ostart + dstSize;
+		const void *const dtPtr = DTable + 1; /* force compiler to not use strict-aliasing */
+		const HUF_DEltX4 *const dt = (const HUF_DEltX4 *)dtPtr;
+		DTableDesc const dtd = HUF_getDTableDesc(DTable);
+		HUF_decodeStreamX4(ostart, &bitD, oend, dt, dtd.tableLog);
+	}
+
+	/* check */
+	if (!BIT_endOfDStream(&bitD))
+		return ERROR(corruption_detected);
+
+	/* decoded size */
+	return dstSize;
+}
+
+size_t HUF_decompress1X4_usingDTable(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable)
+{
+	DTableDesc dtd = HUF_getDTableDesc(DTable);
+	if (dtd.tableType != 1)
+		return ERROR(GENERIC);
+	return HUF_decompress1X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
+}
+
+size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable *DCtx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
+{
+	const BYTE *ip = (const BYTE *)cSrc;
+
+	size_t const hSize = HUF_readDTableX4_wksp(DCtx, cSrc, cSrcSize, workspace, workspaceSize);
+	if (HUF_isError(hSize))
+		return hSize;
+	if (hSize >= cSrcSize)
+		return ERROR(srcSize_wrong);
+	ip += hSize;
+	cSrcSize -= hSize;
+
+	return HUF_decompress1X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx);
+}
+
+static size_t HUF_decompress4X4_usingDTable_internal(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable)
+{
+	if (cSrcSize < 10)
+		return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */
+
+	{
+		const BYTE *const istart = (const BYTE *)cSrc;
+		BYTE *const ostart = (BYTE *)dst;
+		BYTE *const oend = ostart + dstSize;
+		const void *const dtPtr = DTable + 1;
+		const HUF_DEltX4 *const dt = (const HUF_DEltX4 *)dtPtr;
+
+		/* Init */
+		BIT_DStream_t bitD1;
+		BIT_DStream_t bitD2;
+		BIT_DStream_t bitD3;
+		BIT_DStream_t bitD4;
+		size_t const length1 = ZSTD_readLE16(istart);
+		size_t const length2 = ZSTD_readLE16(istart + 2);
+		size_t const length3 = ZSTD_readLE16(istart + 4);
+		size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
+		const BYTE *const istart1 = istart + 6; /* jumpTable */
+		const BYTE *const istart2 = istart1 + length1;
+		const BYTE *const istart3 = istart2 + length2;
+		const BYTE *const istart4 = istart3 + length3;
+		size_t const segmentSize = (dstSize + 3) / 4;
+		BYTE *const opStart2 = ostart + segmentSize;
+		BYTE *const opStart3 = opStart2 + segmentSize;
+		BYTE *const opStart4 = opStart3 + segmentSize;
+		BYTE *op1 = ostart;
+		BYTE *op2 = opStart2;
+		BYTE *op3 = opStart3;
+		BYTE *op4 = opStart4;
+		U32 endSignal;
+		DTableDesc const dtd = HUF_getDTableDesc(DTable);
+		U32 const dtLog = dtd.tableLog;
+
+		if (length4 > cSrcSize)
+			return ERROR(corruption_detected); /* overflow */
+		{
+			size_t const errorCode = BIT_initDStream(&bitD1, istart1, length1);
+			if (HUF_isError(errorCode))
+				return errorCode;
+		}
+		{
+			size_t const errorCode = BIT_initDStream(&bitD2, istart2, length2);
+			if (HUF_isError(errorCode))
+				return errorCode;
+		}
+		{
+			size_t const errorCode = BIT_initDStream(&bitD3, istart3, length3);
+			if (HUF_isError(errorCode))
+				return errorCode;
+		}
+		{
+			size_t const errorCode = BIT_initDStream(&bitD4, istart4, length4);
+			if (HUF_isError(errorCode))
+				return errorCode;
+		}
+
+		/* 16-32 symbols per loop (4-8 symbols per stream) */
+		endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+		for (; (endSignal == BIT_DStream_unfinished) & (op4 < (oend - (sizeof(bitD4.bitContainer) - 1)));) {
+			HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
+			HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
+			HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
+			HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
+			HUF_DECODE_SYMBOLX4_1(op1, &bitD1);
+			HUF_DECODE_SYMBOLX4_1(op2, &bitD2);
+			HUF_DECODE_SYMBOLX4_1(op3, &bitD3);
+			HUF_DECODE_SYMBOLX4_1(op4, &bitD4);
+			HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
+			HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
+			HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
+			HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
+			HUF_DECODE_SYMBOLX4_0(op1, &bitD1);
+			HUF_DECODE_SYMBOLX4_0(op2, &bitD2);
+			HUF_DECODE_SYMBOLX4_0(op3, &bitD3);
+			HUF_DECODE_SYMBOLX4_0(op4, &bitD4);
+
+			endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+		}
+
+		/* check corruption */
+		if (op1 > opStart2)
+			return ERROR(corruption_detected);
+		if (op2 > opStart3)
+			return ERROR(corruption_detected);
+		if (op3 > opStart4)
+			return ERROR(corruption_detected);
+		/* note : op4 already verified within main loop */
+
+		/* finish bitStreams one by one */
+		HUF_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog);
+		HUF_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog);
+		HUF_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog);
+		HUF_decodeStreamX4(op4, &bitD4, oend, dt, dtLog);
+
+		/* check */
+		{
+			U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+			if (!endCheck)
+				return ERROR(corruption_detected);
+		}
+
+		/* decoded size */
+		return dstSize;
+	}
+}
+
+size_t HUF_decompress4X4_usingDTable(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable)
+{
+	DTableDesc dtd = HUF_getDTableDesc(DTable);
+	if (dtd.tableType != 1)
+		return ERROR(GENERIC);
+	return HUF_decompress4X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
+}
+
+size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
+{
+	const BYTE *ip = (const BYTE *)cSrc;
+
+	size_t hSize = HUF_readDTableX4_wksp(dctx, cSrc, cSrcSize, workspace, workspaceSize);
+	if (HUF_isError(hSize))
+		return hSize;
+	if (hSize >= cSrcSize)
+		return ERROR(srcSize_wrong);
+	ip += hSize;
+	cSrcSize -= hSize;
+
+	return HUF_decompress4X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx);
+}
+
+/* ********************************/
+/* Generic decompression selector */
+/* ********************************/
+
+size_t HUF_decompress1X_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable)
+{
+	DTableDesc const dtd = HUF_getDTableDesc(DTable);
+	return dtd.tableType ? HUF_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable)
+			     : HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable);
+}
+
+size_t HUF_decompress4X_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable)
+{
+	DTableDesc const dtd = HUF_getDTableDesc(DTable);
+	return dtd.tableType ? HUF_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable)
+			     : HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable);
+}
+
+typedef struct {
+	U32 tableTime;
+	U32 decode256Time;
+} algo_time_t;
+static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] = {
+    /* single, double, quad */
+    {{0, 0}, {1, 1}, {2, 2}},		     /* Q==0 : impossible */
+    {{0, 0}, {1, 1}, {2, 2}},		     /* Q==1 : impossible */
+    {{38, 130}, {1313, 74}, {2151, 38}},     /* Q == 2 : 12-18% */
+    {{448, 128}, {1353, 74}, {2238, 41}},    /* Q == 3 : 18-25% */
+    {{556, 128}, {1353, 74}, {2238, 47}},    /* Q == 4 : 25-32% */
+    {{714, 128}, {1418, 74}, {2436, 53}},    /* Q == 5 : 32-38% */
+    {{883, 128}, {1437, 74}, {2464, 61}},    /* Q == 6 : 38-44% */
+    {{897, 128}, {1515, 75}, {2622, 68}},    /* Q == 7 : 44-50% */
+    {{926, 128}, {1613, 75}, {2730, 75}},    /* Q == 8 : 50-56% */
+    {{947, 128}, {1729, 77}, {3359, 77}},    /* Q == 9 : 56-62% */
+    {{1107, 128}, {2083, 81}, {4006, 84}},   /* Q ==10 : 62-69% */
+    {{1177, 128}, {2379, 87}, {4785, 88}},   /* Q ==11 : 69-75% */
+    {{1242, 128}, {2415, 93}, {5155, 84}},   /* Q ==12 : 75-81% */
+    {{1349, 128}, {2644, 106}, {5260, 106}}, /* Q ==13 : 81-87% */
+    {{1455, 128}, {2422, 124}, {4174, 124}}, /* Q ==14 : 87-93% */
+    {{722, 128}, {1891, 145}, {1936, 146}},  /* Q ==15 : 93-99% */
+};
+
+/** HUF_selectDecoder() :
+*   Tells which decoder is likely to decode faster,
+*   based on a set of pre-determined metrics.
+*   @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 .
+*   Assumption : 0 < cSrcSize < dstSize <= 128 KB */
+U32 HUF_selectDecoder(size_t dstSize, size_t cSrcSize)
+{
+	/* decoder timing evaluation */
+	U32 const Q = (U32)(cSrcSize * 16 / dstSize); /* Q < 16 since dstSize > cSrcSize */
+	U32 const D256 = (U32)(dstSize >> 8);
+	U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);
+	U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256);
+	DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, for cache eviction */
+
+	return DTime1 < DTime0;
+}
+
+typedef size_t (*decompressionAlgo)(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize);
+
+size_t HUF_decompress4X_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
+{
+	/* validation checks */
+	if (dstSize == 0)
+		return ERROR(dstSize_tooSmall);
+	if (cSrcSize > dstSize)
+		return ERROR(corruption_detected); /* invalid */
+	if (cSrcSize == dstSize) {
+		memcpy(dst, cSrc, dstSize);
+		return dstSize;
+	} /* not compressed */
+	if (cSrcSize == 1) {
+		memset(dst, *(const BYTE *)cSrc, dstSize);
+		return dstSize;
+	} /* RLE */
+
+	{
+		U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+		return algoNb ? HUF_decompress4X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize)
+			      : HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize);
+	}
+}
+
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
+{
+	/* validation checks */
+	if (dstSize == 0)
+		return ERROR(dstSize_tooSmall);
+	if ((cSrcSize >= dstSize) || (cSrcSize <= 1))
+		return ERROR(corruption_detected); /* invalid */
+
+	{
+		U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+		return algoNb ? HUF_decompress4X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize)
+			      : HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize);
+	}
+}
+
+size_t HUF_decompress1X_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize)
+{
+	/* validation checks */
+	if (dstSize == 0)
+		return ERROR(dstSize_tooSmall);
+	if (cSrcSize > dstSize)
+		return ERROR(corruption_detected); /* invalid */
+	if (cSrcSize == dstSize) {
+		memcpy(dst, cSrc, dstSize);
+		return dstSize;
+	} /* not compressed */
+	if (cSrcSize == 1) {
+		memset(dst, *(const BYTE *)cSrc, dstSize);
+		return dstSize;
+	} /* RLE */
+
+	{
+		U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+		return algoNb ? HUF_decompress1X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize)
+			      : HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize);
+	}
+}
--- a/lib/zstd/mem.h
+++ b/lib/zstd/mem.h
@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: (GPL-2.0 or BSD-3-Clause-Clear) */
+/**
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ */
+
+#ifndef MEM_H_MODULE
+#define MEM_H_MODULE
+
+/*-****************************************
+*  Dependencies
+******************************************/
+#include <asm/unaligned.h>
+#include <compiler.h>
+#include <linux/string.h> /* memcpy */
+#include <linux/types.h>  /* size_t, ptrdiff_t */
+
+/*-****************************************
+*  Compiler specifics
+******************************************/
+#define ZSTD_STATIC static __inline __attribute__((unused))
+
+/*-**************************************************************
+*  Basic Types
+*****************************************************************/
+typedef uint8_t BYTE;
+typedef uint16_t U16;
+typedef int16_t S16;
+typedef uint32_t U32;
+typedef int32_t S32;
+typedef uint64_t U64;
+typedef int64_t S64;
+typedef ptrdiff_t iPtrDiff;
+typedef uintptr_t uPtrDiff;
+
+/*-**************************************************************
+*  Memory I/O
+*****************************************************************/
+ZSTD_STATIC unsigned ZSTD_32bits(void) { return sizeof(size_t) == 4; }
+ZSTD_STATIC unsigned ZSTD_64bits(void) { return sizeof(size_t) == 8; }
+
+#if defined(__LITTLE_ENDIAN)
+#define ZSTD_LITTLE_ENDIAN 1
+#else
+#define ZSTD_LITTLE_ENDIAN 0
+#endif
+
+ZSTD_STATIC unsigned ZSTD_isLittleEndian(void) { return ZSTD_LITTLE_ENDIAN; }
+
+ZSTD_STATIC U16 ZSTD_read16(const void *memPtr) { return get_unaligned((const U16 *)memPtr); }
+
+ZSTD_STATIC U32 ZSTD_read32(const void *memPtr) { return get_unaligned((const U32 *)memPtr); }
+
+ZSTD_STATIC U64 ZSTD_read64(const void *memPtr) { return get_unaligned((const U64 *)memPtr); }
+
+ZSTD_STATIC size_t ZSTD_readST(const void *memPtr) { return get_unaligned((const size_t *)memPtr); }
+
+ZSTD_STATIC void ZSTD_write16(void *memPtr, U16 value) { put_unaligned(value, (U16 *)memPtr); }
+
+ZSTD_STATIC void ZSTD_write32(void *memPtr, U32 value) { put_unaligned(value, (U32 *)memPtr); }
+
+ZSTD_STATIC void ZSTD_write64(void *memPtr, U64 value) { put_unaligned(value, (U64 *)memPtr); }
+
+/*=== Little endian r/w ===*/
+
+ZSTD_STATIC U16 ZSTD_readLE16(const void *memPtr) { return get_unaligned_le16(memPtr); }
+
+ZSTD_STATIC void ZSTD_writeLE16(void *memPtr, U16 val) { put_unaligned_le16(val, memPtr); }
+
+ZSTD_STATIC U32 ZSTD_readLE24(const void *memPtr) { return ZSTD_readLE16(memPtr) + (((const BYTE *)memPtr)[2] << 16); }
+
+ZSTD_STATIC void ZSTD_writeLE24(void *memPtr, U32 val)
+{
+	ZSTD_writeLE16(memPtr, (U16)val);
+	((BYTE *)memPtr)[2] = (BYTE)(val >> 16);
+}
+
+ZSTD_STATIC U32 ZSTD_readLE32(const void *memPtr) { return get_unaligned_le32(memPtr); }
+
+ZSTD_STATIC void ZSTD_writeLE32(void *memPtr, U32 val32) { put_unaligned_le32(val32, memPtr); }
+
+ZSTD_STATIC U64 ZSTD_readLE64(const void *memPtr) { return get_unaligned_le64(memPtr); }
+
+ZSTD_STATIC void ZSTD_writeLE64(void *memPtr, U64 val64) { put_unaligned_le64(val64, memPtr); }
+
+ZSTD_STATIC size_t ZSTD_readLEST(const void *memPtr)
+{
+	if (ZSTD_32bits())
+		return (size_t)ZSTD_readLE32(memPtr);
+	else
+		return (size_t)ZSTD_readLE64(memPtr);
+}
+
+ZSTD_STATIC void ZSTD_writeLEST(void *memPtr, size_t val)
+{
+	if (ZSTD_32bits())
+		ZSTD_writeLE32(memPtr, (U32)val);
+	else
+		ZSTD_writeLE64(memPtr, (U64)val);
+}
+
+/*=== Big endian r/w ===*/
+
+ZSTD_STATIC U32 ZSTD_readBE32(const void *memPtr) { return get_unaligned_be32(memPtr); }
+
+ZSTD_STATIC void ZSTD_writeBE32(void *memPtr, U32 val32) { put_unaligned_be32(val32, memPtr); }
+
+ZSTD_STATIC U64 ZSTD_readBE64(const void *memPtr) { return get_unaligned_be64(memPtr); }
+
+ZSTD_STATIC void ZSTD_writeBE64(void *memPtr, U64 val64) { put_unaligned_be64(val64, memPtr); }
+
+ZSTD_STATIC size_t ZSTD_readBEST(const void *memPtr)
+{
+	if (ZSTD_32bits())
+		return (size_t)ZSTD_readBE32(memPtr);
+	else
+		return (size_t)ZSTD_readBE64(memPtr);
+}
+
+ZSTD_STATIC void ZSTD_writeBEST(void *memPtr, size_t val)
+{
+	if (ZSTD_32bits())
+		ZSTD_writeBE32(memPtr, (U32)val);
+	else
+		ZSTD_writeBE64(memPtr, (U64)val);
+}
+
+/* function safe only for comparisons */
+ZSTD_STATIC U32 ZSTD_readMINMATCH(const void *memPtr, U32 length)
+{
+	switch (length) {
+	default:
+	case 4: return ZSTD_read32(memPtr);
+	case 3:
+		if (ZSTD_isLittleEndian())
+			return ZSTD_read32(memPtr) << 8;
+		else
+			return ZSTD_read32(memPtr) >> 8;
+	}
+}
+
+#endif /* MEM_H_MODULE */
--- a/lib/zstd/zstd_common.c
+++ b/lib/zstd/zstd_common.c
@ -0,0 +1,65 @@
+// SPDX-License-Identifier: (GPL-2.0 or BSD-3-Clause-Clear)
+/**
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ */
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include "error_private.h"
+#include "zstd_internal.h" /* declaration of ZSTD_isError, ZSTD_getErrorName, ZSTD_getErrorCode, ZSTD_getErrorString, ZSTD_versionNumber */
+#include <linux/kernel.h>
+
+/*=**************************************************************
+*  Custom allocator
+****************************************************************/
+
+#define stack_push(stack, size)                                 \
+	({                                                      \
+		void *const ptr = ZSTD_PTR_ALIGN((stack)->ptr); \
+		(stack)->ptr = (char *)ptr + (size);            \
+		(stack)->ptr <= (stack)->end ? ptr : NULL;      \
+	})
+
+ZSTD_customMem ZSTD_initStack(void *workspace, size_t workspaceSize)
+{
+	ZSTD_customMem stackMem = {ZSTD_stackAlloc, ZSTD_stackFree, workspace};
+	ZSTD_stack *stack = (ZSTD_stack *)workspace;
+	/* Verify preconditions */
+	if (!workspace || workspaceSize < sizeof(ZSTD_stack) || workspace != ZSTD_PTR_ALIGN(workspace)) {
+		ZSTD_customMem error = {NULL, NULL, NULL};
+		return error;
+	}
+	/* Initialize the stack */
+	stack->ptr = workspace;
+	stack->end = (char *)workspace + workspaceSize;
+	stack_push(stack, sizeof(ZSTD_stack));
+	return stackMem;
+}
+
+void *ZSTD_stackAllocAll(void *opaque, size_t *size)
+{
+	ZSTD_stack *stack = (ZSTD_stack *)opaque;
+	*size = (BYTE const *)stack->end - (BYTE *)ZSTD_PTR_ALIGN(stack->ptr);
+	return stack_push(stack, *size);
+}
+
+void *ZSTD_stackAlloc(void *opaque, size_t size)
+{
+	ZSTD_stack *stack = (ZSTD_stack *)opaque;
+	return stack_push(stack, size);
+}
+void ZSTD_stackFree(void *opaque, void *address)
+{
+	(void)opaque;
+	(void)address;
+}
+
+void *ZSTD_malloc(size_t size, ZSTD_customMem customMem) { return customMem.customAlloc(customMem.opaque, size); }
+
+void ZSTD_free(void *ptr, ZSTD_customMem customMem)
+{
+	if (ptr != NULL)
+		customMem.customFree(customMem.opaque, ptr);
+}
--- a/lib/zstd/zstd_internal.h
+++ b/lib/zstd/zstd_internal.h
@ -0,0 +1,253 @@
+/* SPDX-License-Identifier: (GPL-2.0 or BSD-3-Clause-Clear) */
+/**
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ */
+
+#ifndef ZSTD_CCOMMON_H_MODULE
+#define ZSTD_CCOMMON_H_MODULE
+
+/*-*******************************************************
+*  Compiler specifics
+*********************************************************/
+#define FORCE_INLINE static __always_inline
+#define FORCE_NOINLINE static noinline
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include "error_private.h"
+#include "mem.h"
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/xxhash.h>
+#include <linux/zstd.h>
+
+/*-*************************************
+*  shared macros
+***************************************/
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define CHECK_F(f)                       \
+	{                                \
+		size_t const errcod = f; \
+		if (ERR_isError(errcod)) \
+			return errcod;   \
+	} /* check and Forward error code */
+#define CHECK_E(f, e)                    \
+	{                                \
+		size_t const errcod = f; \
+		if (ERR_isError(errcod)) \
+			return ERROR(e); \
+	} /* check and send Error code */
+#define ZSTD_STATIC_ASSERT(c)                                   \
+	{                                                       \
+		enum { ZSTD_static_assert = 1 / (int)(!!(c)) }; \
+	}
+
+/*-*************************************
+*  Common constants
+***************************************/
+#define ZSTD_OPT_NUM (1 << 12)
+#define ZSTD_DICT_MAGIC 0xEC30A437 /* v0.7+ */
+
+#define ZSTD_REP_NUM 3		      /* number of repcodes */
+#define ZSTD_REP_CHECK (ZSTD_REP_NUM) /* number of repcodes to check by the optimal parser */
+#define ZSTD_REP_MOVE (ZSTD_REP_NUM - 1)
+#define ZSTD_REP_MOVE_OPT (ZSTD_REP_NUM)
+static const U32 repStartValue[ZSTD_REP_NUM] = {1, 4, 8};
+
+#define KB *(1 << 10)
+#define MB *(1 << 20)
+#define GB *(1U << 30)
+
+#define BIT7 128
+#define BIT6 64
+#define BIT5 32
+#define BIT4 16
+#define BIT1 2
+#define BIT0 1
+
+#define ZSTD_WINDOWLOG_ABSOLUTEMIN 10
+static const size_t ZSTD_fcs_fieldSize[4] = {0, 2, 4, 8};
+static const size_t ZSTD_did_fieldSize[4] = {0, 1, 2, 4};
+
+#define ZSTD_BLOCKHEADERSIZE 3 /* C standard doesn't allow `static const` variable to be init using another `static const` variable */
+static const size_t ZSTD_blockHeaderSize = ZSTD_BLOCKHEADERSIZE;
+typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e;
+
+#define MIN_SEQUENCES_SIZE 1									  /* nbSeq==0 */
+#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */
+
+#define HufLog 12
+typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e;
+
+#define LONGNBSEQ 0x7F00
+
+#define MINMATCH 3
+#define EQUAL_READ32 4
+
+#define Litbits 8
+#define MaxLit ((1 << Litbits) - 1)
+#define MaxML 52
+#define MaxLL 35
+#define MaxOff 28
+#define MaxSeq MAX(MaxLL, MaxML) /* Assumption : MaxOff < MaxLL,MaxML */
+#define MLFSELog 9
+#define LLFSELog 9
+#define OffFSELog 8
+
+static const U32 LL_bits[MaxLL + 1] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+static const S16 LL_defaultNorm[MaxLL + 1] = {4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1, -1, -1, -1, -1};
+#define LL_DEFAULTNORMLOG 6 /* for static allocation */
+static const U32 LL_defaultNormLog = LL_DEFAULTNORMLOG;
+
+static const U32 ML_bits[MaxML + 1] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,  0, 0,
+				       0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+static const S16 ML_defaultNorm[MaxML + 1] = {1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  1,  1,  1,  1,  1,  1, 1,
+					      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1};
+#define ML_DEFAULTNORMLOG 6 /* for static allocation */
+static const U32 ML_defaultNormLog = ML_DEFAULTNORMLOG;
+
+static const S16 OF_defaultNorm[MaxOff + 1] = {1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1};
+#define OF_DEFAULTNORMLOG 5 /* for static allocation */
+static const U32 OF_defaultNormLog = OF_DEFAULTNORMLOG;
+
+/*-*******************************************
+*  Shared functions to include for inlining
+*********************************************/
+ZSTD_STATIC void ZSTD_copy8(void *dst, const void *src) {
+	memcpy(dst, src, 8);
+}
+/*! ZSTD_wildcopy() :
+*   custom version of memcpy(), can copy up to 7 bytes too many (8 bytes if length==0) */
+#define WILDCOPY_OVERLENGTH 8
+ZSTD_STATIC void ZSTD_wildcopy(void *dst, const void *src, ptrdiff_t length)
+{
+	const BYTE* ip = (const BYTE*)src;
+	BYTE* op = (BYTE*)dst;
+	BYTE* const oend = op + length;
+	/* Work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81388.
+	 * Avoid the bad case where the loop only runs once by handling the
+	 * special case separately. This doesn't trigger the bug because it
+	 * doesn't involve pointer/integer overflow.
+	 */
+	if (length <= 8)
+		return ZSTD_copy8(dst, src);
+	do {
+		ZSTD_copy8(op, ip);
+		op += 8;
+		ip += 8;
+	} while (op < oend);
+}
+
+/*-*******************************************
+*  Private interfaces
+*********************************************/
+typedef struct ZSTD_stats_s ZSTD_stats_t;
+
+typedef struct {
+	U32 off;
+	U32 len;
+} ZSTD_match_t;
+
+typedef struct {
+	U32 price;
+	U32 off;
+	U32 mlen;
+	U32 litlen;
+	U32 rep[ZSTD_REP_NUM];
+} ZSTD_optimal_t;
+
+typedef struct seqDef_s {
+	U32 offset;
+	U16 litLength;
+	U16 matchLength;
+} seqDef;
+
+typedef struct {
+	seqDef *sequencesStart;
+	seqDef *sequences;
+	BYTE *litStart;
+	BYTE *lit;
+	BYTE *llCode;
+	BYTE *mlCode;
+	BYTE *ofCode;
+	U32 longLengthID; /* 0 == no longLength; 1 == Lit.longLength; 2 == Match.longLength; */
+	U32 longLengthPos;
+	/* opt */
+	ZSTD_optimal_t *priceTable;
+	ZSTD_match_t *matchTable;
+	U32 *matchLengthFreq;
+	U32 *litLengthFreq;
+	U32 *litFreq;
+	U32 *offCodeFreq;
+	U32 matchLengthSum;
+	U32 matchSum;
+	U32 litLengthSum;
+	U32 litSum;
+	U32 offCodeSum;
+	U32 log2matchLengthSum;
+	U32 log2matchSum;
+	U32 log2litLengthSum;
+	U32 log2litSum;
+	U32 log2offCodeSum;
+	U32 factor;
+	U32 staticPrices;
+	U32 cachedPrice;
+	U32 cachedLitLength;
+	const BYTE *cachedLiterals;
+} seqStore_t;
+
+const seqStore_t *ZSTD_getSeqStore(const ZSTD_CCtx *ctx);
+void ZSTD_seqToCodes(const seqStore_t *seqStorePtr);
+int ZSTD_isSkipFrame(ZSTD_DCtx *dctx);
+
+/*= Custom memory allocation functions */
+typedef void *(*ZSTD_allocFunction)(void *opaque, size_t size);
+typedef void (*ZSTD_freeFunction)(void *opaque, void *address);
+typedef struct {
+	ZSTD_allocFunction customAlloc;
+	ZSTD_freeFunction customFree;
+	void *opaque;
+} ZSTD_customMem;
+
+void *ZSTD_malloc(size_t size, ZSTD_customMem customMem);
+void ZSTD_free(void *ptr, ZSTD_customMem customMem);
+
+/*====== stack allocation  ======*/
+
+typedef struct {
+	void *ptr;
+	const void *end;
+} ZSTD_stack;
+
+#define ZSTD_ALIGN(x) ALIGN(x, sizeof(size_t))
+#define ZSTD_PTR_ALIGN(p) PTR_ALIGN(p, sizeof(size_t))
+
+ZSTD_customMem ZSTD_initStack(void *workspace, size_t workspaceSize);
+
+void *ZSTD_stackAllocAll(void *opaque, size_t *size);
+void *ZSTD_stackAlloc(void *opaque, size_t size);
+void ZSTD_stackFree(void *opaque, void *address);
+
+/*======  common function  ======*/
+
+ZSTD_STATIC U32 ZSTD_highbit32(U32 val) { return 31 - __builtin_clz(val); }
+
+/* hidden functions */
+
+/* ZSTD_invalidateRepCodes() :
+ * ensures next compression will not use repcodes from previous block.
+ * Note : only works with regular variant;
+ *        do not use with extDict variant ! */
+void ZSTD_invalidateRepCodes(ZSTD_CCtx *cctx);
+
+size_t ZSTD_freeCCtx(ZSTD_CCtx *cctx);
+size_t ZSTD_freeDCtx(ZSTD_DCtx *dctx);
+size_t ZSTD_freeCDict(ZSTD_CDict *cdict);
+size_t ZSTD_freeDDict(ZSTD_DDict *cdict);
+size_t ZSTD_freeCStream(ZSTD_CStream *zcs);
+size_t ZSTD_freeDStream(ZSTD_DStream *zds);
+
+#endif /* ZSTD_CCOMMON_H_MODULE */
--- a/lib/zstd/zstd_opt.h
+++ b/lib/zstd/zstd_opt.h
--- a/test/dm/cache.c
+++ b/test/dm/cache.c
@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Intel Corporation <www.intel.com>
+ */
+
+#include <common.h>
+#include <dm.h>
+#include <dm/test.h>
+
+static int dm_test_reset(struct unit_test_state *uts)
+{
+	struct udevice *dev_cache;
+	struct cache_info;
+
+	ut_assertok(uclass_get_device(UCLASS_CACHE, 0, &dev_cache));
+	ut_assertok(cache_get_info(dev, &info));
+
+	return 0;
+}
+DM_TEST(dm_test_reset, DM_TESTF_SCAN_FDT);
--- a/test/print_ut.c
+++ b/test/print_ut.c
@ -79,14 +79,18 @@ static int do_ut_print(cmd_tbl_t *cmdtp, int flag, int argc,
 	assert(s == str);
 	assert(!strcmp("\n\nU-Boo\n\n", s));

-	s = display_options_get_banner(true, str, 1);
-	assert(s == str);
-	assert(!strcmp("", s));
+	/* Assert that we do not overwrite memory before the buffer */
+	str[0] = '`';
+	s = display_options_get_banner(true, str + 1, 1);
+	assert(s == str + 1);
+	assert(!strcmp("`", str));

-	s = display_options_get_banner(true, str, 2);
-	assert(s == str);
-	assert(!strcmp("\n", s));
+	str[0] = '~';
+	s = display_options_get_banner(true, str + 1, 2);
+	assert(s == str + 1);
+	assert(!strcmp("~\n", str));

+	/* The last two characters are set to \n\n for all buffer sizes > 2 */
 	s = display_options_get_banner(false, str, sizeof(str));
 	assert(s == str);
 	assert(!strcmp("U-Boot \n\n", s));
--- a/tools/env/fw_env.c
+++ b/tools/env/fw_env.c
@ -1742,7 +1742,7 @@ static int parse_config(struct env_opts *opts)

 		if (ENVSIZE(0) != ENVSIZE(1)) {
 			fprintf(stderr,
-				"Redundant environments have unequal size");
+				"Redundant environments have unequal size\n");
 			return -1;
 		}
 	}