/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */

#include <stddef.h>
#include <stdint.h>

/*
 * See RFC 1071 for mathematical explanations of why we can first sum in a
 * larger register and then narrow down, why we don't need to worry about
 * endianness, etc.
 */
uint16_t ipchksum(const void *data, size_t size)
{
	const uint8_t *p1 = data;
	unsigned long wide_sum = 0;
	uint32_t sum = 0;
	size_t i = 0;

#if defined(__aarch64__)
	size_t size16 = size / 16;
	const uint64_t *p8 = data;

	if (size16) {
		unsigned long tmp1, tmp2;
		i = size16 * 16;
		asm (
			"adds xzr, xzr, xzr\n\t"	/* init carry flag for addition */
			"1:\n\t"
			"ldp %[v1], %[v2], [%[p8]], #16\n\t"
			"adcs %[wsum], %[wsum], %[v1]\n\t"
			"adcs %[wsum], %[wsum], %[v2]\n\t"
			"sub %[size16], %[size16], #1\n\t"
			"cbnz %[size16], 1b\n\t"
			"adcs %[wsum], %[wsum], xzr\n\t"	/* use up last carry */
			: [v1] "=r" (tmp1), [v2] "=r" (tmp2), [wsum] "+r" (wide_sum),
			  [p8] "+r" (p8), [size16] "+r" (size16)
			:: "cc"
		);
	}
#endif

	/* Fold the wide sum down into a 16-bit one's-complement sum. */
	while (wide_sum) {
		sum += wide_sum & 0xFFFF;
		wide_sum >>= 16;
	}
	sum = (sum & 0xFFFF) + (sum >> 16);

	for (; i < size; i++) {
		uint32_t v = p1[i];
		if (i % 2)
			v <<= 8;
		sum += v;

		/* Doing this unconditionally seems to be faster. */
		sum = (sum & 0xFFFF) + (sum >> 16);
	}

	return (uint16_t)~sum;
}

uint16_t ipchksum_add(size_t offset, uint16_t first, uint16_t second)
{
	first = ~first;
	second = ~second;

	/*
	 * Since the checksum is calculated in 16-bit chunks, if the offset at
	 * which the data covered by the second checksum would start (if both
	 * data streams came one after the other) is odd, that means the second
	 * stream starts in the middle of a 16-bit chunk. This means the second
	 * checksum is byte swapped compared to what we need it to be, and we
	 * must swap it back.
	 */
	if (offset % 2)
		second = (second >> 8) | (second << 8);

	uint32_t sum = first + second;
	sum = (sum & 0xFFFF) + (sum >> 16);
	return (uint16_t)~sum;
}
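
#ifdef IPCHKSUM_DEMO
/*
 * Illustrative sketch only, not part of the library: a portable C version of
 * what the aarch64 asm loop above computes. Each adcs is an add with
 * end-around carry; here the carry is folded back in immediately instead of
 * being chained through the flags, which is equivalent modulo 2^64 - 1. The
 * IPCHKSUM_DEMO guard and the helper name are assumptions for this sketch.
 */
static uint64_t wide_sum_ref(const uint64_t *p8, size_t size16)
{
	uint64_t wsum = 0;

	while (size16--) {
		for (int j = 0; j < 2; j++) {
			uint64_t v = *p8++;
			wsum += v;
			if (wsum < v)	/* unsigned wraparound: fold the carry back in */
				wsum++;
		}
	}
	return wsum;
}
#endif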
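
#ifdef IPCHKSUM_DEMO
/*
 * Minimal self-check sketch, also an assumption rather than part of the
 * library: it verifies that two partial checksums joined with ipchksum_add()
 * at an odd offset match the whole-buffer checksum, and that folding a 64-bit
 * one's-complement sum (wide_sum_ref() above) gives the same result as the
 * byte-pair loop, as RFC 1071 promises. The second check assumes a
 * little-endian host, like the aarch64 fast path. Build with something like
 * `cc -DIPCHKSUM_DEMO ipchksum.c` (file name assumed).
 */
#include <assert.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Arbitrary sample bytes, loosely resembling an IPv4 header. */
	const uint8_t buf[16] = {
		0x45, 0x00, 0x00, 0x73, 0x00, 0x00, 0x40, 0x00,
		0x40, 0x11, 0x00, 0x00, 0xc0, 0xa8, 0x00, 0x01,
	};

	/* Whole-buffer checksum vs. two pieces joined at an odd offset,
	   which exercises the byte-swap path in ipchksum_add(). */
	uint16_t whole = ipchksum(buf, sizeof(buf));
	uint16_t joined = ipchksum_add(5, ipchksum(buf, 5),
				       ipchksum(buf + 5, sizeof(buf) - 5));
	assert(whole == joined);

	/* Sum the same bytes as two 64-bit words, then narrow down to 16 bits;
	   the result must match the byte-pair computation. */
	uint64_t words[2];
	memcpy(words, buf, sizeof(words));
	uint64_t wide = wide_sum_ref(words, 1);
	uint32_t folded = 0;
	while (wide) {
		folded += wide & 0xFFFF;
		wide >>= 16;
	}
	folded = (folded & 0xFFFF) + (folded >> 16);
	folded = (folded & 0xFFFF) + (folded >> 16);
	assert((uint16_t)~folded == whole);

	printf("checksum = 0x%04x\n", whole);
	return 0;
}
#endif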