6 files changed, 116 insertions, 63 deletions
diff --git a/payloads/libpayload/include/cbfs_core.h b/payloads/libpayload/include/cbfs_core.h
index 4c59f4131a..4cbc4c0628 100644
--- a/payloads/libpayload/include/cbfs_core.h
+++ b/payloads/libpayload/include/cbfs_core.h
@@ -58,6 +58,7 @@
 
 #define CBFS_COMPRESS_NONE  0
 #define CBFS_COMPRESS_LZMA  1
+#define CBFS_COMPRESS_LZ4   2
 
 /** These are standard component types for well known
     components (i.e - those that coreboot needs to consume.
diff --git a/payloads/libpayload/include/lz4.h b/payloads/libpayload/include/lz4.h
index 1f2830db46..d2120a48fc 100644
--- a/payloads/libpayload/include/lz4.h
+++ b/payloads/libpayload/include/lz4.h
@@ -36,7 +36,10 @@
 
 /* Decompresses an LZ4F image (multiple LZ4 blocks with frame header) from src
  * to dst, ensuring that it doesn't read more than srcn bytes and doesn't write
- * more than dstn. Buffer sizes must stay below 2GB.
+ * more than dstn. Buffer sizes must stay below 2GB. Can decompress files loaded
+ * to the end of a buffer in-place, as long as the buffer is slightly larger
+ * than the final output size (usually by just a few bytes, but by up to
+ * (8 + dstn/255) in the worst case). Reliably returns an error if it is not.
  * Returns amount of decompressed bytes, or 0 on error.
  */
 size_t ulz4fn(const void *src, size_t srcn, void *dst, size_t dstn);
@@ -44,4 +47,4 @@ size_t ulz4fn(const void *src, size_t srcn, void *dst, size_t dstn);
 /* Same as ulz4fn() but does not perform any bounds checks. */
 size_t ulz4f(const void *src, void *dst);
 
-#endif /* __LZO_H_ */
+#endif /* __LZ4_H_ */
diff --git a/payloads/libpayload/libcbfs/cbfs.c b/payloads/libpayload/libcbfs/cbfs.c
index 49e4941181..a1cc7e443c 100644
--- a/payloads/libpayload/libcbfs/cbfs.c
+++ b/payloads/libpayload/libcbfs/cbfs.c
@@ -35,11 +35,16 @@
 #  include <lzma.h>
 #  define CBFS_CORE_WITH_LZMA
 # endif
+# if IS_ENABLED(CONFIG_LP_LZ4)
+#  include <lz4.h>
+#  define CBFS_CORE_WITH_LZ4
+# endif
 # define CBFS_MINI_BUILD
 #elif defined(__SMM__)
 # define CBFS_MINI_BUILD
 #else
 # define CBFS_CORE_WITH_LZMA
+# define CBFS_CORE_WITH_LZ4
 # include <lib.h>
 #endif
 
diff --git a/payloads/libpayload/libcbfs/cbfs_core.c b/payloads/libpayload/libcbfs/cbfs_core.c
index ddf0da5f42..c32d262b33 100644
--- a/payloads/libpayload/libcbfs/cbfs_core.c
+++ b/payloads/libpayload/libcbfs/cbfs_core.c
@@ -34,6 +34,9 @@
  * CBFS_CORE_WITH_LZMA (must be #define)
  *      if defined, ulzma() must exist for decompression of data streams
  *
+ * CBFS_CORE_WITH_LZ4 (must be #define)
+ *      if defined, ulz4f() must exist for decompression of data streams
+ *
  * ERROR(x...)
  *      print an error message x (in printf format)
  *
@@ -330,6 +333,10 @@ int cbfs_decompress(int algo, void *src, void *dst, int len)
 		case CBFS_COMPRESS_LZMA:
 			return ulzma(src, dst);
 #endif
+#ifdef CBFS_CORE_WITH_LZ4
+		case CBFS_COMPRESS_LZ4:
+			return ulz4f(src, dst);
+#endif
 		default:
 			ERROR("tried to decompress %d bytes with algorithm #%x,"
 			      "but that algorithm id is unsupported.\n", len,
diff --git a/payloads/libpayload/liblz4/lz4.c b/payloads/libpayload/liblz4/lz4.c.inc
index fb89090ee2..b3be4e5b44 100644
--- a/payloads/libpayload/liblz4/lz4.c
+++ b/payloads/libpayload/liblz4/lz4.c.inc
@@ -37,12 +37,19 @@
 *  Reading and writing into memory
 **************************************/
 
-/* customized version of memcpy, which may overwrite up to 7 bytes beyond dstEnd */
+/* customized variant of memcpy, which can overwrite up to 7 bytes beyond dstEnd */
 static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd)
 {
     BYTE* d = (BYTE*)dstPtr;
     const BYTE* s = (const BYTE*)srcPtr;
-    BYTE* e = (BYTE*)dstEnd;
+    BYTE* const e = (BYTE*)dstEnd;
+
+#if 0
+    const size_t l2 = 8 - (((size_t)d) & (sizeof(void*)-1));
+    LZ4_copy8(d,s); if (d>e-9) return;
+    d+=l2; s+=l2;
+#endif /* join to align */
+
     do { LZ4_copy8(d,s); d+=8; s+=8; } while (d<e);
 }
 
@@ -52,9 +59,9 @@ static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd)
 **************************************/
 #define MINMATCH 4
 
-#define COPYLENGTH 8
+#define WILDCOPYLENGTH 8
 #define LASTLITERALS 5
-#define MFLIMIT (COPYLENGTH+MINMATCH)
+#define MFLIMIT (WILDCOPYLENGTH+MINMATCH)
 static const int LZ4_minLength = (MFLIMIT+1);
 
 #define KB *(1 <<10)
@@ -114,11 +121,12 @@ FORCE_INLINE int LZ4_decompress_generic(
     const BYTE* const lowLimit = lowPrefix - dictSize;
 
     const BYTE* const dictEnd = (const BYTE*)dictStart + dictSize;
-    const size_t dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4};
-    const size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3};
+    const unsigned dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4};
+    const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3};
 
     const int safeDecode = (endOnInput==endOnInputSize);
     const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB)));
+    const int inPlaceDecode = ((ip >= op) && (ip < oend));
 
 
     /* Special cases */
@@ -133,6 +141,9 @@ FORCE_INLINE int LZ4_decompress_generic(
         unsigned token;
         size_t length;
         const BYTE* match;
+        size_t offset;
+
+        if (unlikely((inPlaceDecode) && (op + WILDCOPYLENGTH > ip))) goto _output_error;   /* output stream ran over input stream */
 
         /* get literal length */
         token = *ip++;
@@ -144,7 +155,7 @@ FORCE_INLINE int LZ4_decompress_generic(
                 s = *ip++;
                 length += s;
             }
-            while (likely((endOnInput)?ip<iend-RUN_MASK:1) && (s==255));
+            while ( likely(endOnInput ? ip<iend-RUN_MASK : 1) && (s==255) );
             if ((safeDecode) && unlikely((size_t)(op+length)<(size_t)(op))) goto _output_error;   /* overflow detection */
             if ((safeDecode) && unlikely((size_t)(ip+length)<(size_t)(ip))) goto _output_error;   /* overflow detection */
         }
@@ -152,7 +163,7 @@ FORCE_INLINE int LZ4_decompress_generic(
         /* copy literals */
         cpy = op+length;
         if (((endOnInput) && ((cpy>(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) )
-            || ((!endOnInput) && (cpy>oend-COPYLENGTH)))
+            || ((!endOnInput) && (cpy>oend-WILDCOPYLENGTH)))
         {
             if (partialDecoding)
             {
@@ -164,7 +175,7 @@ FORCE_INLINE int LZ4_decompress_generic(
                 if ((!endOnInput) && (cpy != oend)) goto _output_error;       /* Error : block decoding must stop exactly there */
                 if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error;   /* Error : input must be consumed */
             }
-            memcpy(op, ip, length);
+            memmove(op, ip, length);
             ip += length;
             op += length;
             break;     /* Necessarily EOF, due to parsing restrictions */
@@ -173,8 +184,9 @@ FORCE_INLINE int LZ4_decompress_generic(
         ip += length; op = cpy;
 
         /* get offset */
-        match = cpy - LZ4_readLE16(ip); ip+=2;
-        if ((checkOffset) && (unlikely(match < lowLimit))) goto _output_error;   /* Error : offset outside destination buffer */
+        offset = LZ4_readLE16(ip); ip+=2;
+        match = op - offset;
+        if ((checkOffset) && (unlikely(match < lowLimit))) goto _output_error;   /* Error : offset outside buffers */
 
         /* get matchlength */
         length = token & ML_MASK;
@@ -204,12 +216,12 @@ FORCE_INLINE int LZ4_decompress_generic(
             }
             else
             {
-                /* match encompass external dictionary and current segment */
+                /* match encompass external dictionary and current block */
                 size_t copySize = (size_t)(lowPrefix-match);
                 memcpy(op, dictEnd - copySize, copySize);
                 op += copySize;
                 copySize = length - copySize;
-                if (copySize > (size_t)(op-lowPrefix))   /* overlap within current segment */
+                if (copySize > (size_t)(op-lowPrefix))   /* overlap copy */
                 {
                     BYTE* const endOfMatch = op + copySize;
                     const BYTE* copyFrom = lowPrefix;
@@ -224,28 +236,30 @@ FORCE_INLINE int LZ4_decompress_generic(
             continue;
         }
 
-        /* copy repeated sequence */
+        /* copy match within block */
         cpy = op + length;
-        if (unlikely((op-match)<8))
+        if (unlikely(offset<8))
         {
-            const size_t dec64 = dec64table[op-match];
+            const int dec64 = dec64table[offset];
             op[0] = match[0];
             op[1] = match[1];
             op[2] = match[2];
             op[3] = match[3];
-            match += dec32table[op-match];
-            LZ4_copy4(op+4, match);
-            op += 8; match -= dec64;
-        } else { LZ4_copy8(op, match); op+=8; match+=8; }
+            match += dec32table[offset];
+            memcpy(op+4, match, 4);
+            match -= dec64;
+        } else { LZ4_copy8(op, match); match+=8; }
+        op += 8;
 
         if (unlikely(cpy>oend-12))
         {
-            if (cpy > oend-LASTLITERALS) goto _output_error;    /* Error : last LASTLITERALS bytes must be literals */
-            if (op < oend-8)
+            BYTE* const oCopyLimit = oend-(WILDCOPYLENGTH-1);
+            if (cpy > oend-LASTLITERALS) goto _output_error;    /* Error : last LASTLITERALS bytes must be literals (uncompressed) */
+            if (op < oCopyLimit)
             {
-                LZ4_wildCopy(op, match, oend-8);
-                match += (oend-8) - op;
-                op = oend-8;
+                LZ4_wildCopy(op, match, oCopyLimit);
+                match += oCopyLimit - op;
+                op = oCopyLimit;
             }
             while (op<cpy) *op++ = *match++;
         }
diff --git a/payloads/libpayload/liblz4/lz4_wrapper.c b/payloads/libpayload/liblz4/lz4_wrapper.c
index 431fb55cc0..6de140e403 100644
--- a/payloads/libpayload/liblz4/lz4_wrapper.c
+++ b/payloads/libpayload/liblz4/lz4_wrapper.c
@@ -29,7 +29,6 @@
  * SUCH DAMAGE.
  */
 
-#include <assert.h>
 #include <endian.h>
 #include <libpayload.h>
 #include <lz4.h>
@@ -38,9 +37,28 @@
  * seem to be very inefficient in practice (at least on ARM64). Since libpayload
  * knows about endinaness and allows some basic assumptions (such as unaligned
  * access support), we can easily write the ones we need ourselves. */
-static u16 LZ4_readLE16(const void *src) { return le16toh(*(u16 *)src); }
-static void LZ4_copy4(void *dst, const void *src) { *(u32 *)dst = *(u32 *)src; }
-static void LZ4_copy8(void *dst, const void *src) { *(u64 *)dst = *(u64 *)src; }
+static uint16_t LZ4_readLE16(const void *src)
+{
+	return le16toh(*(uint16_t *)src);
+}
+static void LZ4_copy8(void *dst, const void *src)
+{
+/* Copies 8 bytes from src to dst. ARM32 needs explicit LDR/STR to prevent GCC
+ * from coalescing into LDRD/STRD (which don't support unaligned accesses). */
+#ifdef __arm__
+	uint32_t x0, x1;
+	asm volatile (
+		"ldr %[x0], [%[src]]\n\t"
+		"ldr %[x1], [%[src], #4]\n\t"
+		"str %[x0], [%[dst]]\n\t"
+		"str %[x1], [%[dst], #4]\n\t"
+		/* Early-clobber: x0/x1 are written while src/dst are live. */
+		: [x0]"=&r"(x0), [x1]"=&r"(x1)
+		: [src]"r"(src), [dst]"r"(dst) : "memory" );
+#else
+	*(uint64_t *)dst = *(const uint64_t *)src;
+#endif
+}
 
 typedef  uint8_t BYTE;
 typedef uint16_t U16;
@@ -52,58 +70,59 @@ typedef uint64_t U64;
 #define likely(expr) __builtin_expect((expr) != 0, 1)
 #define unlikely(expr) __builtin_expect((expr) != 0, 0)
 
-/* Unaltered (except removing unrelated code) from github.com/Cyan4973/lz4. */
-#include "lz4.c"	/* #include for inlining, do not link! */
+/* Unaltered (just removed unrelated code) from github.com/Cyan4973/lz4/dev. */
+#include "lz4.c.inc"	/* #include for inlining, do not link! */
 
 #define LZ4F_MAGICNUMBER 0x184D2204
 
 struct lz4_frame_header {
-	u32 magic;
+	uint32_t magic;
 	union {
-		u8 flags;
+		uint8_t flags;
 		struct {
-			u8 reserved0		: 2;
-			u8 has_content_checksum	: 1;
-			u8 has_content_size	: 1;
-			u8 has_block_checksum	: 1;
-			u8 independent_blocks	: 1;
-			u8 version		: 2;
+			uint8_t reserved0		: 2;
+			uint8_t has_content_checksum	: 1;
+			uint8_t has_content_size	: 1;
+			uint8_t has_block_checksum	: 1;
+			uint8_t independent_blocks	: 1;
+			uint8_t version			: 2;
 		};
 	};
 	union {
-		u8 block_descriptor;
+		uint8_t block_descriptor;
 		struct {
-			u8 reserved1		: 4;
-			u8 max_block_size	: 3;
-			u8 reserved2		: 1;
+			uint8_t reserved1		: 4;
+			uint8_t max_block_size		: 3;
+			uint8_t reserved2		: 1;
 		};
 	};
-	/* + u64 content_size iff has_content_size is set */
-	/* + u8 header_checksum */
+	/* + uint64_t content_size iff has_content_size is set */
+	/* + uint8_t header_checksum */
 } __attribute__((packed));
 
 struct lz4_block_header {
 	union {
-		u32 raw;
+		uint32_t raw;
 		struct {
-			u32 size		: 31;
-			u32 not_compressed	: 1;
+			uint32_t size		: 31;
+			uint32_t not_compressed	: 1;
 		};
 	};
 	/* + size bytes of data */
-	/* + u32 block_checksum iff has_block_checksum is set */
+	/* + uint32_t block_checksum iff has_block_checksum is set */
 } __attribute__((packed));
 
 size_t ulz4fn(const void *src, size_t srcn, void *dst, size_t dstn)
 {
 	const void *in = src;
 	void *out = dst;
+	size_t out_size = 0;
 	int has_block_checksum;
 
 	{ /* With in-place decompression the header may become invalid later. */
 		const struct lz4_frame_header *h = in;
 
-		if (srcn < sizeof(*h) + sizeof(u64) + sizeof(u8))
+		if (srcn < sizeof(*h) + sizeof(uint64_t) + sizeof(uint8_t))
 			return 0;	/* input overrun */
 
 		/* We assume there's always only a single, standard frame. */
@@ -117,25 +136,27 @@ size_t ulz4fn(const void *src, size_t srcn, void *dst, size_t dstn)
 
 		in += sizeof(*h);
 		if (h->has_content_size)
-			in += sizeof(u64);
-		in += sizeof(u8);
+			in += sizeof(uint64_t);
+		in += sizeof(uint8_t);
 	}
 
 	while (1) {
-		struct lz4_block_header b = { .raw = le32toh(*(u32 *)in) };
+		struct lz4_block_header b = { .raw = le32toh(*(uint32_t *)in) };
 		in += sizeof(struct lz4_block_header);
 
-		if (in - src + b.size > srcn)
-			return 0;		/* input overrun */
+		if ((size_t)(in - src) + b.size > srcn)
+			break;			/* input overrun */
 
-		if (!b.size)
-			return out - dst;	/* decompression successful */
+		if (!b.size) {
+			out_size = out - dst;
+			break;			/* decompression successful */
+		}
 
 		if (b.not_compressed) {
-			size_t size = MIN((u32)b.size, dst + dstn - out);
+			size_t size = MIN((uint32_t)b.size, dst + dstn - out);
 			memcpy(out, in, size);
 			if (size < b.size)
-				return 0;	/* output overrun */
+				break;		/* output overrun */
 			else
 				out += size;
 		} else {
@@ -144,15 +165,17 @@ size_t ulz4fn(const void *src, size_t srcn, void *dst, size_t dstn)
 					dst + dstn - out, endOnInputSize,
 					full, 0, noDict, out, NULL, 0);
 			if (ret < 0)
-				return 0;	/* decompression error */
+				break;		/* decompression error */
 			else
 				out += ret;
 		}
 
 		in += b.size;
 		if (has_block_checksum)
-			in += sizeof(u32);
+			in += sizeof(uint32_t);
 	}
+
+	return out_size;
 }
 
 size_t ulz4f(const void *src, void *dst)