Diffstat (limited to 'src/vendorcode/cavium/bdk/libdram/dram-tune-ddr3.c')
 -rw-r--r--  src/vendorcode/cavium/bdk/libdram/dram-tune-ddr3.c  2012
 1 file changed, 2012 insertions(+), 0 deletions(-)
diff --git a/src/vendorcode/cavium/bdk/libdram/dram-tune-ddr3.c b/src/vendorcode/cavium/bdk/libdram/dram-tune-ddr3.c
new file mode 100644
index 0000000000..e0e9d4442c
--- /dev/null
+++ b/src/vendorcode/cavium/bdk/libdram/dram-tune-ddr3.c
@@ -0,0 +1,2012 @@
+/***********************license start***********************************
+* Copyright (c) 2003-2017 Cavium Inc. (support@cavium.com). All rights
+* reserved.
+*
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are
+* met:
+*
+* * Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* * Redistributions in binary form must reproduce the above
+* copyright notice, this list of conditions and the following
+* disclaimer in the documentation and/or other materials provided
+* with the distribution.
+*
+* * Neither the name of Cavium Inc. nor the names of
+* its contributors may be used to endorse or promote products
+* derived from this software without specific prior written
+* permission.
+*
+* This Software, including technical data, may be subject to U.S. export
+* control laws, including the U.S. Export Administration Act and its
+* associated regulations, and may be subject to export or import
+* regulations in other countries.
+*
+* TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
+* AND WITH ALL FAULTS AND CAVIUM INC. MAKES NO PROMISES, REPRESENTATIONS OR
+* WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH RESPECT
+* TO THE SOFTWARE, INCLUDING ITS CONDITION, ITS CONFORMITY TO ANY
+* REPRESENTATION OR DESCRIPTION, OR THE EXISTENCE OF ANY LATENT OR PATENT
+* DEFECTS, AND CAVIUM SPECIFICALLY DISCLAIMS ALL IMPLIED (IF ANY) WARRANTIES
+* OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS FOR A PARTICULAR
+* PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS, QUIET ENJOYMENT,
+* QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. THE ENTIRE RISK
+* ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE LIES WITH YOU.
+***********************license end**************************************/
+#include <bdk.h>
+#include "dram-internal.h"
+
+// if enhanced verbosity levels are defined, use them
+#if defined(VB_PRT)
+#define ddr_print2(format, ...) VB_PRT(VBL_FAE, format, ##__VA_ARGS__)
+#define ddr_print3(format, ...) VB_PRT(VBL_TME, format, ##__VA_ARGS__)
+#define ddr_print4(format, ...) VB_PRT(VBL_DEV, format, ##__VA_ARGS__)
+#define ddr_print5(format, ...) VB_PRT(VBL_DEV3, format, ##__VA_ARGS__)
+#else
+#define ddr_print2 ddr_print
+#define ddr_print3 ddr_print
+#define ddr_print4 ddr_print
+#define ddr_print5 ddr_print
+#endif
+
+static int64_t test_dram_byte_threads_done;
+static uint64_t test_dram_byte_threads_errs;
+static uint64_t test_dram_byte_lmc_errs[4];
+
+#if 0
+/*
+ * Suggested testing patterns.
+ */
+static const uint64_t test_pattern_2[] = {
+ 0xFFFFFFFFFFFFFFFFULL,
+ 0xAAAAAAAAAAAAAAAAULL,
+ 0xFFFFFFFFFFFFFFFFULL,
+ 0xAAAAAAAAAAAAAAAAULL,
+ 0x5555555555555555ULL,
+ 0xAAAAAAAAAAAAAAAAULL,
+ 0xFFFFFFFFFFFFFFFFULL,
+ 0xAAAAAAAAAAAAAAAAULL,
+ 0xFFFFFFFFFFFFFFFFULL,
+ 0x5555555555555555ULL,
+ 0xFFFFFFFFFFFFFFFFULL,
+ 0x5555555555555555ULL,
+ 0xAAAAAAAAAAAAAAAAULL,
+ 0x5555555555555555ULL,
+ 0xFFFFFFFFFFFFFFFFULL,
+ 0x5555555555555555ULL,
+};
+ /*
+ * or possibly
+ */
+static const uint64_t test_pattern_3[] = {
+ 0xFDFDFDFDFDFDFDFDULL,
+ 0x8787878787878787ULL,
+ 0xFEFEFEFEFEFEFEFEULL,
+ 0xC3C3C3C3C3C3C3C3ULL,
+ 0x7F7F7F7F7F7F7F7FULL,
+ 0xE1E1E1E1E1E1E1E1ULL,
+ 0xBFBFBFBFBFBFBFBFULL,
+ 0xF0F0F0F0F0F0F0F0ULL,
+ 0xDFDFDFDFDFDFDFDFULL,
+ 0x7878787878787878ULL,
+ 0xEFEFEFEFEFEFEFEFULL,
+ 0x3C3C3C3C3C3C3C3CULL,
+ 0xF7F7F7F7F7F7F7F7ULL,
+ 0x1E1E1E1E1E1E1E1EULL,
+ 0xFBFBFBFBFBFBFBFBULL,
+ 0x0F0F0F0F0F0F0F0FULL,
+};
+
+static const uint64_t test_pattern_1[] = {
+ 0xAAAAAAAAAAAAAAAAULL,
+ 0x5555555555555555ULL,
+ 0xAAAAAAAAAAAAAAAAULL,
+ 0x5555555555555555ULL,
+ 0xAAAAAAAAAAAAAAAAULL,
+ 0x5555555555555555ULL,
+ 0xAAAAAAAAAAAAAAAAULL,
+ 0x5555555555555555ULL,
+ 0xAAAAAAAAAAAAAAAAULL,
+ 0x5555555555555555ULL,
+ 0xAAAAAAAAAAAAAAAAULL,
+ 0x5555555555555555ULL,
+ 0xAAAAAAAAAAAAAAAAULL,
+ 0x5555555555555555ULL,
+ 0xAAAAAAAAAAAAAAAAULL,
+ 0x5555555555555555ULL,
+#if 0 // only need a cacheline size
+ 0xAAAAAAAAAAAAAAAAULL,
+ 0x5555555555555555ULL,
+ 0xAAAAAAAAAAAAAAAAULL,
+ 0x5555555555555555ULL,
+ 0xAAAAAAAAAAAAAAAAULL,
+ 0x5555555555555555ULL,
+ 0xAAAAAAAAAAAAAAAAULL,
+ 0x5555555555555555ULL,
+ 0xAAAAAAAAAAAAAAAAULL,
+ 0x5555555555555555ULL,
+ 0xAAAAAAAAAAAAAAAAULL,
+ 0x5555555555555555ULL,
+ 0xAAAAAAAAAAAAAAAAULL,
+ 0x5555555555555555ULL,
+ 0xAAAAAAAAAAAAAAAAULL,
+ 0x5555555555555555ULL,
+#endif
+};
+
+// setup default for test pattern array
+static const uint64_t *dram_tune_test_pattern = test_pattern_1;
+#endif
+
+// set this to 1 to shorten the testing to exit when all byte lanes have errors
+// having this at 0 forces the testing to take place over the entire range every iteration,
+// hopefully ensuring an even load on the memory subsystem
+#define EXIT_WHEN_ALL_LANES_HAVE_ERRORS 0
+
+#define DEFAULT_TEST_BURSTS 5 // FIXME: this is what works so far... (was 7)
+int dram_tune_use_bursts = DEFAULT_TEST_BURSTS;
+
+// dram_tune_rank_offset is used to offset the second area used in test_dram_mem_xor.
+//
+// If only a single-rank DIMM, the offset will be 256MB from the start of the first area,
+// which is more than enough for the restricted looping/address range actually tested...
+//
+// If a 2-rank DIMM, the offset will be the size of a rank's address space, so the effect
+// will be to have the first and second areas in different ranks on the same DIMM.
+//
+// So, we default this to single-rank, and it will be overridden when 2-ranks are detected.
+//
+
+// FIXME: ASSUME that we have DIMMs no less than 4GB in size
+
+// offset to first area that avoids any boot stuff in low range (below 256MB)
+#define AREA_BASE_OFFSET (1ULL << 28) // bit 28 always ON
+
+// offset to duplicate area; may coincide with rank 1 base address for 2-rank 4GB DIMM
+#define AREA_DUPE_OFFSET (1ULL << 31) // bit 31 always ON
+
+// defaults to DUPE, but will be set elsewhere to offset to next RANK if multi-rank DIMM
+static uint64_t dram_tune_rank_offset = AREA_DUPE_OFFSET; // default
+
+// defaults to 0, but will be set elsewhere to the address offset to next DIMM if multi-slot
+static uint64_t dram_tune_dimm_offset = 0; // default
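+
+// Worked example, assuming the single-rank defaults above and p == 0:
+//
+//   0x00000000   boot-reserved low range (never touched; below 256MB)
+//   0x10000000   test area 1 = p + AREA_BASE_OFFSET
+//   0x90000000   test area 2 = area 1 + AREA_DUPE_OFFSET (2GB higher)
+//
+// When a 2-rank DIMM is detected, dram_tune_rank_offset is recomputed
+// from LMCX_CONFIG[PBANK_LSB]/[RANK_ENA] so that area 2 instead lands
+// at the same offset within rank 1.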
+
+
+static int speed_bin_offset[3] = {25, 20, 15};
+static int speed_bin_winlen[3] = {70, 60, 60};
+
+static int
+get_speed_bin(bdk_node_t node, int lmc)
+{
+ uint32_t mts_speed = (libdram_get_freq_from_pll(node, lmc) / 1000000) * 2;
+ int ret = 0;
+
+ // FIXME: is this reasonable speed "binning"?
+ if (mts_speed >= 1700) {
+ if (mts_speed >= 2000)
+ ret = 2;
+ else
+ ret = 1;
+ }
+
+ debug_print("N%d.LMC%d: %s: returning bin %d for MTS %d\n",
+ node, lmc, __FUNCTION__, ret, mts_speed);
+
+ return ret;
+}
+
+static int is_low_risk_offset(int speed_bin, int offset)
+{
+ return (_abs(offset) <= speed_bin_offset[speed_bin]);
+}
+static int is_low_risk_winlen(int speed_bin, int winlen)
+{
+ return (winlen >= speed_bin_winlen[speed_bin]);
+}
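+
+// How the two predicates above read the tables, per speed bin:
+//   bin 0 (below 1700 MT/s) : offset low-risk within +/-25, window low-risk at >= 70
+//   bin 1 (1700-1999 MT/s)  : offset within +/-20, window >= 60
+//   bin 2 (2000+ MT/s)      : offset within +/-15, window >= 60
+// The margining report later flags a lane for review only when it fails
+// BOTH tests (see will_need_review in auto_set_dll_offset).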
+
+#define ENABLE_PREFETCH 0
+#define ENABLE_WBIL2 1
+#define ENABLE_SBLKDTY 0
+
+#define BDK_SYS_CVMCACHE_INV_L2 "#0,c11,c1,#1" // L2 Cache Invalidate
+#define BDK_CACHE_INV_L2(address) { asm volatile ("sys " BDK_SYS_CVMCACHE_INV_L2 ", %0" : : "r" (address)); }
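+
+// The SYS encoding above (op1=#0, CRn=c11, CRm=c1, op2=#1) is a Cavium
+// implementation-defined cache op that invalidates the L2 line holding
+// 'address' without writing it back; the companion macros used below
+// (BDK_CACHE_WBI_L2, BDK_PREFETCH, BDK_DCACHE_INVALIDATE) are presumably
+// supplied by the shared BDK headers.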
+
+int dram_tuning_mem_xor(bdk_node_t node, int lmc, uint64_t p, uint64_t bitmask, uint64_t *xor_data)
+{
+ uint64_t p1, p2, d1, d2;
+ uint64_t v, v1;
+ uint64_t p2offset = 0x10000000; /* was: dram_tune_rank_offset */ // FIXME?
+ uint64_t datamask;
+ uint64_t xor;
+ uint64_t i, j, k;
+ uint64_t ii;
+ int errors = 0;
+ //uint64_t index;
+ uint64_t pattern1 = bdk_rng_get_random64();
+ uint64_t pattern2 = 0;
+ uint64_t bad_bits[2] = {0,0};
+
+#if ENABLE_SBLKDTY
+ BDK_CSR_MODIFY(c, node, BDK_L2C_CTL, c.s.dissblkdty = 0);
+#endif
+
+ // Byte lanes may be clear in the mask to indicate no testing on that lane.
+ datamask = bitmask;
+
+ // final address must include LMC and node
+ p |= (lmc<<7); /* Map address into proper interface */
+ p = bdk_numa_get_address(node, p); /* Map to node */
+
+ /* Add offset to both test regions to not clobber boot stuff
+ * when running from L2 for NAND boot.
+ */
+ p += AREA_BASE_OFFSET; // make sure base is out of the way of boot
+
+#define II_INC (1ULL << 29)
+#define II_MAX (1ULL << 31)
+#define K_INC (1ULL << 14)
+#define K_MAX (1ULL << 20)
+#define J_INC (1ULL << 9)
+#define J_MAX (1ULL << 12)
+#define I_INC (1ULL << 3)
+#define I_MAX (1ULL << 7)
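+
+// How the strides above tile each test area (per area, per pass):
+//   I : 16 x 8-byte words = one 128-byte cacheline
+//   J :  8 cachelines spaced 512B  -> 4KB span
+//   K : 64 blocks spaced 16KB      -> 1MB span
+//   II:  4 regions spaced 512MB    -> 2GB span
+// i.e. 4*64*8*16 = 32K words (256KB) actually touched, scattered across
+// 2GB; this is the "restricted looping/address range" mentioned earlier.
+// (The cacheline math assumes the 128-byte lines of these CN8xxx parts.)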
+
+ debug_print("N%d.LMC%d: dram_tuning_mem_xor: phys_addr=0x%lx\n",
+ node, lmc, p);
+
+#if 0
+ int ix;
+ // add this loop to fill memory with the test pattern first
+ // loops are ordered so that only entire cachelines are written
+ for (ii = 0; ii < II_MAX; ii += II_INC) { // FIXME? extend the range of memory tested!!
+ for (k = 0; k < K_MAX; k += K_INC) {
+ for (j = 0; j < J_MAX; j += J_INC) {
+ p1 = p + ii + k + j;
+ p2 = p1 + p2offset;
+ for (i = 0, ix = 0; i < I_MAX; i += I_INC, ix++) {
+
+ v = dram_tune_test_pattern[ix];
+ v1 = v; // write the same thing to both areas
+
+ __bdk_dram_write64(p1 + i, v);
+ __bdk_dram_write64(p2 + i, v1);
+
+ }
+#if ENABLE_WBIL2
+ BDK_CACHE_WBI_L2(p1);
+ BDK_CACHE_WBI_L2(p2);
+#endif
+ }
+ }
+ } /* for (ii = 0; ii < (1ULL << 31); ii += (1ULL << 29)) */
+#endif
+
+#if ENABLE_PREFETCH
+ BDK_PREFETCH(p , BDK_CACHE_LINE_SIZE);
+ BDK_PREFETCH(p + p2offset, BDK_CACHE_LINE_SIZE);
+#endif
+
+ // loops are ordered so that only a single 64-bit slot is written to each cacheline at one time,
+ // then the cachelines are forced out; this should maximize read/write traffic
+ for (ii = 0; ii < II_MAX; ii += II_INC) { // FIXME? extend the range of memory tested!!
+ for (k = 0; k < K_MAX; k += K_INC) {
+ for (i = 0; i < I_MAX; i += I_INC) {
+ for (j = 0; j < J_MAX; j += J_INC) {
+
+ p1 = p + ii + k + j;
+ p2 = p1 + p2offset;
+
+#if ENABLE_PREFETCH
+ if (j < (J_MAX - J_INC)) {
+ BDK_PREFETCH(p1 + J_INC, BDK_CACHE_LINE_SIZE);
+ BDK_PREFETCH(p2 + J_INC, BDK_CACHE_LINE_SIZE);
+ }
+#endif
+
+ v = pattern1 * (p1 + i);
+ v1 = v; // write the same thing to both areas
+
+ __bdk_dram_write64(p1 + i, v);
+ __bdk_dram_write64(p2 + i, v1);
+
+#if ENABLE_WBIL2
+ BDK_CACHE_WBI_L2(p1);
+ BDK_CACHE_WBI_L2(p2);
+#endif
+ }
+ }
+ }
+ } /* for (ii = 0; ii < (1ULL << 31); ii += (1ULL << 29)) */
+
+ BDK_DCACHE_INVALIDATE;
+
+ debug_print("N%d.LMC%d: dram_tuning_mem_xor: done INIT loop\n",
+ node, lmc);
+
+ /* Make a series of passes over the memory areas. */
+
+ for (int burst = 0; burst < 1/* was: dram_tune_use_bursts*/; burst++)
+ {
+ uint64_t this_pattern = bdk_rng_get_random64();
+ pattern2 ^= this_pattern;
+
+ /* XOR the data with a random value, applying the change to both
+ * memory areas.
+ */
+#if ENABLE_PREFETCH
+ BDK_PREFETCH(p , BDK_CACHE_LINE_SIZE);
+ BDK_PREFETCH(p + p2offset, BDK_CACHE_LINE_SIZE);
+#endif
+
+ for (ii = 0; ii < II_MAX; ii += II_INC) { // FIXME? extend the range of memory tested!!
+ for (k = 0; k < K_MAX; k += K_INC) {
+ for (i = 0; i < I_MAX; i += I_INC) { // FIXME: rearranged, did not make much difference?
+ for (j = 0; j < J_MAX; j += J_INC) {
+
+ p1 = p + ii + k + j;
+ p2 = p1 + p2offset;
+
+#if ENABLE_PREFETCH
+ if (j < (J_MAX - J_INC)) {
+ BDK_PREFETCH(p1 + J_INC, BDK_CACHE_LINE_SIZE);
+ BDK_PREFETCH(p2 + J_INC, BDK_CACHE_LINE_SIZE);
+ }
+#endif
+
+ v = __bdk_dram_read64(p1 + i) ^ this_pattern;
+ v1 = __bdk_dram_read64(p2 + i) ^ this_pattern;
+
+#if ENABLE_WBIL2
+ BDK_CACHE_INV_L2(p1);
+ BDK_CACHE_INV_L2(p2);
+#endif
+
+ __bdk_dram_write64(p1 + i, v);
+ __bdk_dram_write64(p2 + i, v1);
+
+#if ENABLE_WBIL2
+ BDK_CACHE_WBI_L2(p1);
+ BDK_CACHE_WBI_L2(p2);
+#endif
+ }
+ }
+ }
+ } /* for (ii = 0; ii < (1ULL << 31); ii += (1ULL << 29)) */
+
+ BDK_DCACHE_INVALIDATE;
+
+ debug_print("N%d.LMC%d: dram_tuning_mem_xor: done MODIFY loop\n",
+ node, lmc);
+
+#if ENABLE_PREFETCH
+ BDK_PREFETCH(p , BDK_CACHE_LINE_SIZE);
+ BDK_PREFETCH(p + p2offset, BDK_CACHE_LINE_SIZE);
+#endif
+
+ /* Look for differences in the areas. If there is a mismatch, reset
+ * both memory locations with the same pattern. Failing to do so
+ * means that on all subsequent passes the pair of locations remain
+ * out of sync giving spurious errors.
+ */
+ // FIXME: change the loop order so that an entire cache line is compared at one time
+ // FIXME: this is so that a read error that occurs *anywhere* on the cacheline will be caught,
+ // FIXME: rather than comparing only 1 cacheline slot at a time, where an error on a different
+ // FIXME: slot will be missed that time around
+ // Does the above make sense?
+
+ for (ii = 0; ii < II_MAX; ii += II_INC) { // FIXME? extend the range of memory tested!!
+ for (k = 0; k < K_MAX; k += K_INC) {
+ for (j = 0; j < J_MAX; j += J_INC) {
+
+ p1 = p + ii + k + j;
+ p2 = p1 + p2offset;
+
+#if ENABLE_PREFETCH
+ if (j < (J_MAX - J_INC)) {
+ BDK_PREFETCH(p1 + J_INC, BDK_CACHE_LINE_SIZE);
+ BDK_PREFETCH(p2 + J_INC, BDK_CACHE_LINE_SIZE);
+ }
+#endif
+
+ // process entire cachelines in the innermost loop
+ for (i = 0; i < I_MAX; i += I_INC) {
+
+ v = ((p1 + i) * pattern1) ^ pattern2; // FIXME: this should predict what we find...???
+ d1 = __bdk_dram_read64(p1 + i);
+ d2 = __bdk_dram_read64(p2 + i);
+
+ xor = ((d1 ^ v) | (d2 ^ v)) & datamask; // union of error bits only in active byte lanes
+
+ if (!xor)
+ continue;
+
+ // accumulate bad bits
+ bad_bits[0] |= xor;
+ //bad_bits[1] |= ~mpr_data1 & 0xffUL; // cannot do ECC here
+
+ int bybit = 1;
+ uint64_t bymsk = 0xffULL; // start in byte lane 0
+ while (xor != 0) {
+ debug_print("ERROR(%03d): [0x%016lX] [0x%016lX] expected 0x%016lX d1 %016lX d2 %016lX\n",
+ burst, p1, p2, v, d1, d2);
+ if (xor & bymsk) { // error(s) in this lane
+ errors |= bybit; // set the byte error bit
+ xor &= ~bymsk; // clear byte lane in error bits
+ datamask &= ~bymsk; // clear the byte lane in the mask
+#if EXIT_WHEN_ALL_LANES_HAVE_ERRORS
+ if (datamask == 0) { // nothing left to do
+ return errors; // completely done when errors found in all byte lanes in datamask
+ }
+#endif /* EXIT_WHEN_ALL_LANES_HAVE_ERRORS */
+ }
+ bymsk <<= 8; // move mask into next byte lane
+ bybit <<= 1; // move bit into next byte position
+ }
+ }
+#if ENABLE_WBIL2
+ BDK_CACHE_WBI_L2(p1);
+ BDK_CACHE_WBI_L2(p2);
+#endif
+ }
+ }
+ } /* for (ii = 0; ii < (1ULL << 31); ii += (1ULL << 29)) */
+
+ debug_print("N%d.LMC%d: dram_tuning_mem_xor: done TEST loop\n",
+ node, lmc);
+
+ } /* for (int burst = 0; burst < dram_tune_use_bursts; burst++) */
+
+ if (xor_data != NULL) { // send the bad bits back...
+ xor_data[0] = bad_bits[0];
+ xor_data[1] = bad_bits[1]; // let it be zeroed
+ }
+
+#if ENABLE_SBLKDTY
+ BDK_CSR_MODIFY(c, node, BDK_L2C_CTL, c.s.dissblkdty = 1);
+#endif
+
+ return errors;
+}
+
+#undef II_INC
+#undef II_MAX
+
+#define EXTRACT(v, lsb, width) (((v) >> (lsb)) & ((1ull << (width)) - 1))
+#define LMCNO(address, xbits) (EXTRACT(address, 7, xbits) ^ EXTRACT(address, 20, xbits) ^ EXTRACT(address, 12, xbits))
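+
+// Example: with xbits == 2 (4 LMCs interleaved), the owning LMC is the
+// XOR of three 2-bit address slices:
+//   LMCNO(a, 2) = a<8:7> ^ a<21:20> ^ a<13:12>
+// so address 0x0000080 (only bit 7 set) hashes to LMC 1, while
+// 0x0100080 (bits 20 and 7 set) hashes back to LMC 0.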
+
+static int dram_tuning_mem_xor2(uint64_t p, uint64_t bitmask, int xbits)
+{
+ uint64_t p1, p2, d1, d2;
+ uint64_t v, vpred;
+ uint64_t p2offset = dram_tune_rank_offset; // FIXME?
+ uint64_t datamask;
+ uint64_t xor;
+ uint64_t ii;
+ uint64_t pattern1 = bdk_rng_get_random64();
+ uint64_t pattern2 = 0;
+ int errors = 0;
+ int errs_by_lmc[4] = { 0,0,0,0 };
+ int lmc;
+ uint64_t vbase, vincr;
+
+ // Byte lanes may be clear in the mask to indicate no testing on that lane.
+ datamask = bitmask;
+
+ /* Add offset to both test regions to not clobber boot stuff
+ * when running from L2 for NAND boot.
+ */
+ p += AREA_BASE_OFFSET; // make sure base is out of the way of boot
+
+ // move the multiplies outside the loop
+ vbase = p * pattern1;
+ vincr = 8 * pattern1;
+
+#define II_INC (1ULL << 3)
+#define II_MAX (1ULL << 22) // stop where the core ID bits start
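+
+// Note: II_MAX of 1 << 22 makes each area one contiguous 4MB walk that
+// stops exactly where the per-core address bits begin (CORE_SHIFT below
+// is also 22), so concurrently running cores never touch the same lines.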
+
+ // walk the memory areas by 8-byte words
+ v = vbase;
+ for (ii = 0; ii < II_MAX; ii += II_INC) {
+
+ p1 = p + ii;
+ p2 = p1 + p2offset;
+
+ __bdk_dram_write64(p1, v);
+ __bdk_dram_write64(p2, v);
+
+ v += vincr;
+ }
+
+ __bdk_dram_flush_to_mem_range(p , p + II_MAX);
+ __bdk_dram_flush_to_mem_range(p + p2offset, p + p2offset + II_MAX);
+ BDK_DCACHE_INVALIDATE;
+
+ /* Make a series of passes over the memory areas. */
+
+ for (int burst = 0; burst < dram_tune_use_bursts; burst++)
+ {
+ uint64_t this_pattern = bdk_rng_get_random64();
+ pattern2 ^= this_pattern;
+
+ /* XOR the data with a random value, applying the change to both
+ * memory areas.
+ */
+#if 0
+ BDK_PREFETCH(p , BDK_CACHE_LINE_SIZE);
+ BDK_PREFETCH(p + p2offset, BDK_CACHE_LINE_SIZE);
+#endif
+ for (ii = 0; ii < II_MAX; ii += II_INC) { // FIXME? extend the range of memory tested!!
+
+ p1 = p + ii;
+ p2 = p1 + p2offset;
+
+ d1 = __bdk_dram_read64(p1) ^ this_pattern;
+ d2 = __bdk_dram_read64(p2) ^ this_pattern;
+
+ __bdk_dram_write64(p1, d1);
+ __bdk_dram_write64(p2, d2);
+
+ }
+ __bdk_dram_flush_to_mem_range(p , p + II_MAX);
+ __bdk_dram_flush_to_mem_range(p + p2offset, p + p2offset + II_MAX);
+ BDK_DCACHE_INVALIDATE;
+
+ /* Look for differences in the areas. If there is a mismatch, reset
+ * both memory locations with the same pattern. Failing to do so
+ * means that on all subsequent passes the pair of locations remain
+ * out of sync giving spurious errors.
+ */
+#if 0
+ BDK_PREFETCH(p , BDK_CACHE_LINE_SIZE);
+ BDK_PREFETCH(p + p2offset, BDK_CACHE_LINE_SIZE);
+#endif
+ vpred = vbase;
+ for (ii = 0; ii < II_MAX; ii += II_INC) {
+
+ p1 = p + ii;
+ p2 = p1 + p2offset;
+
+ v = vpred ^ pattern2; // this should predict what we find...
+ d1 = __bdk_dram_read64(p1);
+ d2 = __bdk_dram_read64(p2);
+ vpred += vincr;
+
+ xor = ((d1 ^ v) | (d2 ^ v)) & datamask; // union of error bits only in active byte lanes
+ if (!xor) // no errors
+ continue;
+
+ lmc = LMCNO(p1, xbits); // FIXME: LMC should be SAME for p1 and p2!!!
+ if (lmc != (int)LMCNO(p2, xbits)) {
+ printf("ERROR: LMCs for addresses [0x%016lX] (%lld) and [0x%016lX] (%lld) differ!!!\n",
+ p1, LMCNO(p1, xbits), p2, LMCNO(p2, xbits));
+ }
+ int bybit = 1;
+ uint64_t bymsk = 0xffULL; // start in byte lane 0
+ while (xor != 0) {
+ debug_print("ERROR(%03d): [0x%016lX] [0x%016lX] expected 0x%016lX d1 %016lX d2 %016lX\n",
+ burst, p1, p2, v, d1, d2);
+ if (xor & bymsk) { // error(s) in this lane
+ errs_by_lmc[lmc] |= bybit; // set the byte error bit in the LMCs errors
+ errors |= bybit; // set the byte error bit
+ xor &= ~bymsk; // clear byte lane in error bits
+ //datamask &= ~bymsk; // clear the byte lane in the mask
+ }
+ bymsk <<= 8; // move mask into next byte lane
+ bybit <<= 1; // move bit into next byte position
+ } /* while (xor != 0) */
+ } /* for (ii = 0; ii < II_MAX; ii += II_INC) */
+ } /* for (int burst = 0; burst < dram_tune_use_bursts; burst++) */
+
+ // update the global LMC error states
+ for (lmc = 0; lmc < 4; lmc++) {
+ if (errs_by_lmc[lmc]) {
+ bdk_atomic_fetch_and_bset64_nosync(&test_dram_byte_lmc_errs[lmc], errs_by_lmc[lmc]);
+ }
+ }
+
+ return errors;
+}
+
+#if 0
+static int dram_tuning_mem_rows(uint64_t p, uint64_t bitmask)
+{
+ uint64_t p1, p2, d1, d2;
+ uint64_t v, v1;
+ uint64_t p2offset = dram_tune_rank_offset; // FIXME?
+ uint64_t datamask;
+ uint64_t xor;
+ int i, j, k, ii;
+ int errors = 0;
+ int index;
+ uint64_t pattern1 = 0; // FIXME: maybe this could be from a table?
+ uint64_t pattern2;
+
+ // Byte lanes may be clear in the mask to indicate no testing on that lane.
+ datamask = bitmask;
+
+ /* Add offset to both test regions to not clobber boot stuff
+ * when running from L2 for NAND boot.
+ */
+ p += 0x10000000; // FIXME? was: 0x4000000; // make sure base is out of the way of cores for tuning
+
+ pattern2 = pattern1;
+ for (k = 0; k < (1 << 20); k += (1 << 14)) {
+ for (j = 0; j < (1 << 12); j += (1 << 9)) {
+ for (i = 0; i < (1 << 7); i += 8) {
+ index = i + j + k;
+ p1 = p + index;
+ p2 = p1 + p2offset;
+
+ v = pattern2;
+ v1 = v; // write the same thing to same slot in both cachelines
+ pattern2 = ~pattern2; // flip bits for next slots
+
+ __bdk_dram_write64(p1, v);
+ __bdk_dram_write64(p2, v1);
+ }
+#if 1
+ BDK_CACHE_WBI_L2(p1);
+ BDK_CACHE_WBI_L2(p2);
+#endif
+ }
+ }
+
+#if 0
+ __bdk_dram_flush_to_mem_range(p, p + (1ULL << 20)); // max_addr is start + where k stops...
+ __bdk_dram_flush_to_mem_range(p + p2offset, p + p2offset + (1ULL << 20)); // max_addr is start + where k stops...
+#endif
+ BDK_DCACHE_INVALIDATE;
+
+ /* Make a series of passes over the memory areas. */
+
+ for (int burst = 0; burst < dram_tune_use_bursts; burst++)
+ {
+ /* just read and flip the bits applying the change to both
+ * memory areas.
+ */
+ for (k = 0; k < (1 << 20); k += (1 << 14)) {
+ for (j = 0; j < (1 << 12); j += (1 << 9)) {
+ for (i = 0; i < (1 << 7); i += 8) {
+ index = i + j + k;
+ p1 = p + index;
+ p2 = p1 + p2offset;
+
+ v = ~__bdk_dram_read64(p1);
+ v1 = ~__bdk_dram_read64(p2);
+
+ __bdk_dram_write64(p1, v);
+ __bdk_dram_write64(p2, v1);
+ }
+#if 1
+ BDK_CACHE_WBI_L2(p1);
+ BDK_CACHE_WBI_L2(p2);
+#endif
+ }
+ }
+
+#if 0
+ __bdk_dram_flush_to_mem_range(p, p + (1ULL << 20)); // max_addr is start + where k stops...
+ __bdk_dram_flush_to_mem_range(p + p2offset, p + p2offset + (1ULL << 20)); // max_addr is start + where k stops...
+#endif
+ BDK_DCACHE_INVALIDATE;
+
+ /* Look for differences in the areas. If there is a mismatch, reset
+ * both memory locations with the same pattern. Failing to do so
+ * means that on all subsequent passes the pair of locations remain
+ * out of sync giving spurious errors.
+ */
+
+ // FIXME: change the loop order so that an entire cache line is compared at one time
+ // FIXME: this is so that a read error that occurs *anywhere* on the cacheline will be caught,
+ // FIXME: rather than comparing only 1 cacheline slot at a time, where an error on a different
+ // FIXME: slot will be missed that time around
+ // Does the above make sense?
+
+ pattern2 = ~pattern1; // slots have been flipped by the above loop
+
+ for (k = 0; k < (1 << 20); k += (1 << 14)) {
+ for (j = 0; j < (1 << 12); j += (1 << 9)) {
+ for (i = 0; i < (1 << 7); i += 8) {
+ index = i + j + k;
+ p1 = p + index;
+ p2 = p1 + p2offset;
+
+ v = pattern2; // FIXME: this should predict what we find...???
+ d1 = __bdk_dram_read64(p1);
+ d2 = __bdk_dram_read64(p2);
+ pattern2 = ~pattern2; // flip for next slot
+
+ xor = ((d1 ^ v) | (d2 ^ v)) & datamask; // union of error bits only in active byte lanes
+
+ int bybit = 1;
+ uint64_t bymsk = 0xffULL; // start in byte lane 0
+ while (xor != 0) {
+ debug_print("ERROR(%03d): [0x%016lX] [0x%016lX] expected 0x%016lX d1 %016lX d2 %016lX\n",
+ burst, p1, p2, v, d1, d2);
+ if (xor & bymsk) { // error(s) in this lane
+ errors |= bybit; // set the byte error bit
+ xor &= ~bymsk; // clear byte lane in error bits
+ datamask &= ~bymsk; // clear the byte lane in the mask
+#if EXIT_WHEN_ALL_LANES_HAVE_ERRORS
+ if (datamask == 0) { // nothing left to do
+ return errors; // completely done when errors found in all byte lanes in datamask
+ }
+#endif /* EXIT_WHEN_ALL_LANES_HAVE_ERRORS */
+ }
+ bymsk <<= 8; // move mask into next byte lane
+ bybit <<= 1; // move bit into next byte position
+ }
+ }
+ }
+ }
+ pattern1 = ~pattern1; // flip the starting pattern for the next burst
+
+ } /* for (int burst = 0; burst < dram_tune_use_bursts; burst++) */
+ return errors;
+}
+#endif
+
+// cores to use
+#define DEFAULT_USE_CORES 44 // FIXME: was (1 << CORE_BITS)
+int dram_tune_use_cores = DEFAULT_USE_CORES; // max cores to use, override available
+int dram_tune_max_cores; // max cores available on a node
+#define CORE_SHIFT 22 // FIXME: offset into rank_address passed to test_dram_byte
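+
+// Sketch of how a worker's addresses are composed, e.g. core 5 on a
+// 2-LMC node (so lmc = 5 % 2 = 1 in the per-LMC thread):
+//   base_address = (5 << CORE_SHIFT)      // private 4MB window
+//                | dram_tune_dimm_offset  // only if 2 DIMMs present
+// the callee then ORs in (lmc << 7), maps in the node bits, and adds
+// AREA_BASE_OFFSET before touching memory.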
+
+typedef void (*__dram_tuning_thread_t)(int arg, void *arg1);
+
+typedef struct
+{
+ bdk_node_t node;
+ int64_t num_lmcs;
+ uint64_t byte_mask;
+} test_dram_byte_info_t;
+
+static void dram_tuning_thread(int arg, void *arg1)
+{
+ test_dram_byte_info_t *test_info = arg1;
+ int core = arg;
+ uint64_t errs;
+ bdk_node_t node = test_info->node;
+ int num_lmcs, lmc;
+#if 0
+ num_lmcs = test_info->num_lmcs;
+ // map core numbers into hopefully equal groups per LMC
+ lmc = core % num_lmcs;
+#else
+ // FIXME: this code should allow running all the cores on a single LMC...
+ // if incoming num_lmcs > 0, then use as normal; if < 0 remap to a single LMC
+ if (test_info->num_lmcs >= 0) {
+ num_lmcs = test_info->num_lmcs;
+ // map core numbers into hopefully equal groups per LMC
+ lmc = core % num_lmcs;
+ } else {
+ num_lmcs = 1;
+ // incoming num_lmcs is (desired LMC - 10)
+ lmc = 10 + test_info->num_lmcs;
+ }
+#endif
+ uint64_t base_address = 0/* was: (lmc << 7); now done by callee */;
+ uint64_t bytemask = test_info->byte_mask;
+
+ /* Figure out our work memory range.
+ *
+ * Note: base_address above just provides the physical offset which determines
+ * specific LMC portions of the address space and does not have the node bits set.
+ */
+ //was: base_address = bdk_numa_get_address(node, base_address); // map to node // now done by callee
+ base_address |= (core << CORE_SHIFT); // FIXME: also put full core into address
+ if (dram_tune_dimm_offset) { // if multi-slot in some way, choose a DIMM for the core
+ base_address |= (core & (1 << (num_lmcs >> 1))) ? dram_tune_dimm_offset : 0;
+ }
+
+ debug_print("Node %d, core %d, Testing area 1 at 0x%011lx, area 2 at 0x%011lx\n",
+ node, core, base_address + AREA_BASE_OFFSET,
+ base_address + AREA_BASE_OFFSET + dram_tune_rank_offset);
+
+ errs = dram_tuning_mem_xor(node, lmc, base_address, bytemask, NULL);
+ //errs = dram_tuning_mem_rows(base_address, bytemask);
+
+ /* Report that we're done */
+ debug_print("Core %d on LMC %d node %d done with test_dram_byte with 0x%lx errs\n",
+ core, lmc, node, errs);
+
+ if (errs) {
+ bdk_atomic_fetch_and_bset64_nosync(&test_dram_byte_threads_errs, errs);
+ bdk_atomic_fetch_and_bset64_nosync(&test_dram_byte_lmc_errs[lmc], errs);
+ }
+
+ bdk_atomic_add64_nosync(&test_dram_byte_threads_done, 1);
+
+ return;
+}
+
+static void dram_tuning_thread2(int arg, void *arg1)
+{
+ test_dram_byte_info_t *test_info = arg1;
+ int core = arg;
+ uint64_t errs;
+ bdk_node_t node = test_info->node;
+ int num_lmcs = test_info->num_lmcs;
+
+ uint64_t base_address = 0;
+ uint64_t bytemask = test_info->byte_mask;
+
+ /* Figure out our work memory range.
+ *
+ * Note: base_address above just provides the physical offset which determines
+ * specific portions of the address space and does not have the node bits set.
+ */
+ base_address = bdk_numa_get_address(node, base_address); // map to node
+ base_address |= (core << CORE_SHIFT); // FIXME: also put full core into address
+ if (dram_tune_dimm_offset) { // if multi-slot in some way, choose a DIMM for the core
+ base_address |= (core & 1) ? dram_tune_dimm_offset : 0;
+ }
+
+ debug_print("Node %d, core %d, Testing area 1 at 0x%011lx, area 2 at 0x%011lx\n",
+ node, core, base_address + AREA_BASE_OFFSET,
+ base_address + AREA_BASE_OFFSET + dram_tune_rank_offset);
+
+ errs = dram_tuning_mem_xor2(base_address, bytemask, (num_lmcs >> 1)); // 4->2, 2->1, 1->0
+ //errs = dram_tuning_mem_rows(base_address, bytemask);
+
+ /* Report that we're done */
+ debug_print("Core %d on LMC %d node %d done with test_dram_byte with 0x%lx errs\n",
+ core, lmc, node, errs);
+
+ if (errs) {
+ bdk_atomic_fetch_and_bset64_nosync(&test_dram_byte_threads_errs, errs);
+ // FIXME: this will have been done already in the called test routine
+ //bdk_atomic_fetch_and_bset64_nosync(&test_dram_byte_lmc_errs[lmc], errs);
+ }
+
+ bdk_atomic_add64_nosync(&test_dram_byte_threads_done, 1);
+
+ return;
+}
+
+static int dram_tune_use_xor2 = 1; // FIXME: do NOT default to original mem_xor (LMC-based) code
+
+static int
+run_dram_tuning_threads(bdk_node_t node, int num_lmcs, uint64_t bytemask)
+{
+ test_dram_byte_info_t test_dram_byte_info;
+ test_dram_byte_info_t *test_info = &test_dram_byte_info;
+ int total_count = 0;
+ __dram_tuning_thread_t thread_p = (dram_tune_use_xor2) ? dram_tuning_thread2 : dram_tuning_thread;
+
+ test_info->node = node;
+ test_info->num_lmcs = num_lmcs;
+ test_info->byte_mask = bytemask;
+
+ // init some global data
+ bdk_atomic_set64(&test_dram_byte_threads_done, 0);
+ bdk_atomic_set64((int64_t *)&test_dram_byte_threads_errs, 0);
+ bdk_atomic_set64((int64_t *)&test_dram_byte_lmc_errs[0], 0);
+ bdk_atomic_set64((int64_t *)&test_dram_byte_lmc_errs[1], 0);
+ bdk_atomic_set64((int64_t *)&test_dram_byte_lmc_errs[2], 0);
+ bdk_atomic_set64((int64_t *)&test_dram_byte_lmc_errs[3], 0);
+
+ /* Start threads for cores on the node */
+ if (bdk_numa_exists(node)) {
+ debug_print("Starting %d threads for test_dram_byte\n", dram_tune_use_cores);
+ for (int core = 0; core < dram_tune_use_cores; core++) {
+ if (bdk_thread_create(node, 0, thread_p, core, (void *)test_info, 0)) {
+ bdk_error("Failed to create thread %d for test_dram_byte\n", core);
+ } else {
+ total_count++;
+ }
+ }
+ }
+
+#if 0
+ /* Wait for threads to finish */
+ while (bdk_atomic_get64(&test_dram_byte_threads_done) < total_count)
+ bdk_thread_yield();
+#else
+#define TIMEOUT_SECS 5 // FIXME: long enough so a pass for a given setting will not print
+ /* Wait for threads to finish, with progress */
+ int cur_count;
+ uint64_t cur_time;
+ uint64_t period = bdk_clock_get_rate(bdk_numa_local(), BDK_CLOCK_TIME) * TIMEOUT_SECS; // FIXME?
+ uint64_t timeout = bdk_clock_get_count(BDK_CLOCK_TIME) + period;
+ do {
+ bdk_thread_yield();
+ cur_count = bdk_atomic_get64(&test_dram_byte_threads_done);
+ cur_time = bdk_clock_get_count(BDK_CLOCK_TIME);
+ if (cur_time >= timeout) {
+ printf("Waiting for %d cores\n", total_count - cur_count);
+ timeout = cur_time + period;
+ }
+ } while (cur_count < total_count);
+#endif
+
+ // NOTE: this is the summary of errors across all LMCs
+ return (int)bdk_atomic_get64((int64_t *)&test_dram_byte_threads_errs);
+}
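+
+// The nonzero bits of run_dram_tuning_threads()'s return value identify
+// byte lanes that failed on at least one core; the per-LMC breakdown is
+// left behind in test_dram_byte_lmc_errs[] for callers that need it.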
+
+/* These variables count the number of ECC errors. They should only be accessed atomically */
+extern int64_t __bdk_dram_ecc_single_bit_errors[];
+extern int64_t __bdk_dram_ecc_double_bit_errors[];
+
+#if 0
+// make the tuning test callable as a standalone
+int
+bdk_run_dram_tuning_test(int node)
+{
+ int num_lmcs = __bdk_dram_get_num_lmc(node);
+ const char *s;
+ int lmc, byte;
+ int errors;
+ uint64_t start_dram_dclk[4], start_dram_ops[4];
+ int save_use_bursts;
+
+ // check for the cores on this node, abort if not more than 1 // FIXME?
+ dram_tune_max_cores = bdk_get_num_running_cores(node);
+ if (dram_tune_max_cores < 2) {
+ //bdk_init_cores(node, 0);
+ printf("N%d: ERROR: not enough cores to run the DRAM tuning test.\n", node);
+ return 0;
+ }
+
+ // but use only a certain number of cores, at most what is available
+ if ((s = getenv("ddr_tune_use_cores")) != NULL) {
+ dram_tune_use_cores = strtoul(s, NULL, 0);
+ if (dram_tune_use_cores <= 0) // allow 0 or negative to mean all
+ dram_tune_use_cores = dram_tune_max_cores;
+ }
+ if (dram_tune_use_cores > dram_tune_max_cores)
+ dram_tune_use_cores = dram_tune_max_cores;
+
+ // save the original bursts, so we can replace it with a better number for just testing
+ save_use_bursts = dram_tune_use_bursts;
+ dram_tune_use_bursts = 1500; // FIXME: hard code bursts for the test here...
+
+ // allow override of the test repeats (bursts) per thread create
+ if ((s = getenv("ddr_tune_use_bursts")) != NULL) {
+ dram_tune_use_bursts = strtoul(s, NULL, 10);
+ }
+
+ // allow override of the test mem_xor algorithm
+ if ((s = getenv("ddr_tune_use_xor2")) != NULL) {
+ dram_tune_use_xor2 = !!strtoul(s, NULL, 10);
+ }
+
+ // FIXME? consult LMC0 only
+ BDK_CSR_INIT(lmcx_config, node, BDK_LMCX_CONFIG(0));
+ if (lmcx_config.s.rank_ena) { // replace the default offset when there is more than 1 rank...
+ dram_tune_rank_offset = 1ull << (28 + lmcx_config.s.pbank_lsb - lmcx_config.s.rank_ena + (num_lmcs/2));
+ ddr_print("N%d: run_dram_tuning_test: changing rank offset to 0x%lx\n", node, dram_tune_rank_offset);
+ }
+ if (lmcx_config.s.init_status & 0x0c) { // bit 2 or 3 set indicates 2 DIMMs
+ dram_tune_dimm_offset = 1ull << (28 + lmcx_config.s.pbank_lsb + (num_lmcs/2));
+ ddr_print("N%d: run_dram_tuning_test: changing dimm offset to 0x%lx\n", node, dram_tune_dimm_offset);
+ }
+ int ddr_interface_64b = !lmcx_config.s.mode32b;
+
+ // construct the bytemask
+ int bytes_todo = (ddr_interface_64b) ? 0xff : 0x0f; // FIXME: hack?
+ uint64_t bytemask = 0;
+ for (byte = 0; byte < 8; ++byte) {
+ uint64_t bitmask;
+ if (bytes_todo & (1 << byte)) {
+ bitmask = ((!ddr_interface_64b) && (byte == 4)) ? 0x0f: 0xff;
+ bytemask |= bitmask << (8*byte); // set the bytes bits in the bytemask
+ }
+ } /* for (byte = 0; byte < 8; ++byte) */
+
+ // print current working values
+ ddr_print("N%d: run_dram_tuning_test: max %d cores, use %d cores, use %d bursts.\n",
+ node, dram_tune_max_cores, dram_tune_use_cores, dram_tune_use_bursts);
+
+ // do the setup on active LMCs
+ for (lmc = 0; lmc < num_lmcs; lmc++) {
+ // record start cycle CSRs here for utilization measure
+ start_dram_dclk[lmc] = BDK_CSR_READ(node, BDK_LMCX_DCLK_CNT(lmc));
+ start_dram_ops[lmc] = BDK_CSR_READ(node, BDK_LMCX_OPS_CNT(lmc));
+#if 0
+ bdk_atomic_set64(&__bdk_dram_ecc_single_bit_errors[lmc], 0);
+ bdk_atomic_set64(&__bdk_dram_ecc_double_bit_errors[lmc], 0);
+#else
+ __bdk_dram_ecc_single_bit_errors[lmc] = 0;
+ __bdk_dram_ecc_double_bit_errors[lmc] = 0;
+#endif
+ } /* for (lmc = 0; lmc < num_lmcs; lmc++) */
+
+ bdk_watchdog_poke();
+
+ // run the test(s)
+ // only 1 call should be enough, let the bursts, etc, control the load...
+ errors = run_dram_tuning_threads(node, num_lmcs, bytemask);
+
+ /* Check ECC error counters after the test */
+ int64_t ecc_single = 0;
+ int64_t ecc_double = 0;
+ int64_t ecc_single_errs[4];
+ int64_t ecc_double_errs[4];
+
+ // finally, print the utilizations all together, and sum the ECC errors
+ for (lmc = 0; lmc < num_lmcs; lmc++) {
+ uint64_t dclk_diff = BDK_CSR_READ(node, BDK_LMCX_DCLK_CNT(lmc)) - start_dram_dclk[lmc];
+ uint64_t ops_diff = BDK_CSR_READ(node, BDK_LMCX_OPS_CNT(lmc)) - start_dram_ops[lmc];
+ uint64_t percent_x10 = ops_diff * 1000 / dclk_diff;
+ printf("N%d.LMC%d: ops %lu, cycles %lu, used %lu.%lu%%\n",
+ node, lmc, ops_diff, dclk_diff, percent_x10 / 10, percent_x10 % 10);
+
+ ecc_single += (ecc_single_errs[lmc] = bdk_atomic_get64(&__bdk_dram_ecc_single_bit_errors[lmc]));
+ ecc_double += (ecc_double_errs[lmc] = bdk_atomic_get64(&__bdk_dram_ecc_double_bit_errors[lmc]));
+ } /* for (lmc = 0; lmc < num_lmcs; lmc++) */
+
+ /* Always print any ECC errors */
+ if (ecc_single || ecc_double) {
+ printf("Test \"%s\": ECC errors, %ld/%ld/%ld/%ld corrected, %ld/%ld/%ld/%ld uncorrected\n",
+ "DRAM Tuning Test",
+ ecc_single_errs[0], ecc_single_errs[1], ecc_single_errs[2], ecc_single_errs[3],
+ ecc_double_errs[0], ecc_double_errs[1], ecc_double_errs[2], ecc_double_errs[3]);
+ }
+ if (errors || ecc_double || ecc_single) {
+ printf("Test \"%s\": FAIL: %ld single, %ld double, %d compare errors\n",
+ "DRAM Tuning Test", ecc_single, ecc_double, errors);
+ }
+
+ // restore bursts
+ dram_tune_use_bursts = save_use_bursts;
+
+ return (errors + ecc_double + ecc_single);
+}
+#endif /* 0 */
+
+#define DEFAULT_SAMPLE_GRAN 3 // sample for errors every N offset values
+#define MIN_BYTE_OFFSET -63
+#define MAX_BYTE_OFFSET +63
+int dram_tune_use_gran = DEFAULT_SAMPLE_GRAN;
+
+static int
+auto_set_dll_offset(bdk_node_t node, int dll_offset_mode,
+ int num_lmcs, int ddr_interface_64b,
+ int do_tune)
+{
+ int byte_offset;
+ //unsigned short result[9];
+ int byte;
+ int byte_delay_start[4][9];
+ int byte_delay_count[4][9];
+ uint64_t byte_delay_windows [4][9];
+ int byte_delay_best_start[4][9];
+ int byte_delay_best_count[4][9];
+ //int this_rodt;
+ uint64_t ops_sum[4], dclk_sum[4];
+ uint64_t start_dram_dclk[4], stop_dram_dclk[4];
+ uint64_t start_dram_ops[4], stop_dram_ops[4];
+ int errors, tot_errors;
+ int lmc;
+ const char *mode_str = (dll_offset_mode == 2) ? "Read" : "Write";
+ int mode_is_read = (dll_offset_mode == 2);
+ const char *mode_blk = (dll_offset_mode == 2) ? " " : "";
+ int start_offset, end_offset, incr_offset;
+
+ int speed_bin = get_speed_bin(node, 0); // FIXME: just get from LMC0?
+ int low_risk_count = 0, needs_review_count = 0;
+
+ if (dram_tune_use_gran != DEFAULT_SAMPLE_GRAN) {
+ ddr_print2("N%d: Changing sample granularity from %d to %d\n",
+ node, DEFAULT_SAMPLE_GRAN, dram_tune_use_gran);
+ }
+ // ensure sample is taken at 0
+ start_offset = MIN_BYTE_OFFSET - (MIN_BYTE_OFFSET % dram_tune_use_gran);
+ end_offset = MAX_BYTE_OFFSET - (MAX_BYTE_OFFSET % dram_tune_use_gran);
+ incr_offset = dram_tune_use_gran;
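+
+ // Example: the defaults (-63..+63, granularity 3) already divide
+ // evenly, so start/end stay at -63/+63 and offset 0 is sampled.
+ // With ddr_tune_use_gran=4, C truncation gives -63 % 4 == -3 and
+ // 63 % 4 == 3, so the sweep runs -60, -56, ..., 0, ..., +60; every
+ // sample is a multiple of the granularity, which guarantees 0.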
+
+ memset(ops_sum, 0, sizeof(ops_sum));
+ memset(dclk_sum, 0, sizeof(dclk_sum));
+ memset(byte_delay_start, 0, sizeof(byte_delay_start));
+ memset(byte_delay_count, 0, sizeof(byte_delay_count));
+ memset(byte_delay_windows, 0, sizeof(byte_delay_windows));
+ memset(byte_delay_best_start, 0, sizeof(byte_delay_best_start));
+ memset(byte_delay_best_count, 0, sizeof(byte_delay_best_count));
+
+ // FIXME? consult LMC0 only
+ BDK_CSR_INIT(lmcx_config, node, BDK_LMCX_CONFIG(0));
+ if (lmcx_config.s.rank_ena) { // replace the default offset when there is more than 1 rank...
+ dram_tune_rank_offset = 1ull << (28 + lmcx_config.s.pbank_lsb - lmcx_config.s.rank_ena + (num_lmcs/2));
+ ddr_print2("N%d: Tuning multiple ranks per DIMM (rank offset 0x%lx).\n", node, dram_tune_rank_offset);
+ }
+ if (lmcx_config.s.init_status & 0x0c) { // bit 2 or 3 set indicates 2 DIMMs
+ dram_tune_dimm_offset = 1ull << (28 + lmcx_config.s.pbank_lsb + (num_lmcs/2));
+ ddr_print2("N%d: Tuning multiple DIMMs per channel (DIMM offset 0x%lx)\n", node, dram_tune_dimm_offset);
+ }
+
+ // FIXME? do this for LMC0 only
+ //BDK_CSR_INIT(comp_ctl2, node, BDK_LMCX_COMP_CTL2(0));
+ //this_rodt = comp_ctl2.s.rodt_ctl;
+
+ // construct the bytemask
+ int bytes_todo = (ddr_interface_64b) ? 0xff : 0x0f;
+ uint64_t bytemask = 0;
+ for (byte = 0; byte < 8; ++byte) {
+ if (bytes_todo & (1 << byte)) {
+ bytemask |= 0xfful << (8*byte); // set the bytes bits in the bytemask
+ }
+ } /* for (byte = 0; byte < 8; ++byte) */
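+
+ // e.g. a 64-bit interface yields bytemask 0xFFFFFFFFFFFFFFFF, while a
+ // 32-bit interface (bytes_todo 0x0f) yields 0x00000000FFFFFFFF, so the
+ // test routines ignore the unused upper lanes when comparing.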
+
+ // now loop through selected legal values for the DLL byte offset...
+
+ for (byte_offset = start_offset; byte_offset <= end_offset; byte_offset += incr_offset) {
+
+ // do the setup on active LMCs
+ for (lmc = 0; lmc < num_lmcs; lmc++) {
+ change_dll_offset_enable(node, lmc, 0);
+
+ // set all byte lanes at once
+ load_dll_offset(node, lmc, dll_offset_mode, byte_offset, 10 /* All bytes at once */);
+ // but then clear the ECC byte lane so it should be neutral for the test...
+ load_dll_offset(node, lmc, dll_offset_mode, 0, 8);
+
+ change_dll_offset_enable(node, lmc, 1);
+
+ // record start cycle CSRs here for utilization measure
+ start_dram_dclk[lmc] = BDK_CSR_READ(node, BDK_LMCX_DCLK_CNT(lmc));
+ start_dram_ops[lmc] = BDK_CSR_READ(node, BDK_LMCX_OPS_CNT(lmc));
+ } /* for (lmc = 0; lmc < num_lmcs; lmc++) */
+
+ bdk_watchdog_poke();
+
+ // run the test(s)
+ // only 1 call should be enough, let the bursts, etc, control the load...
+ tot_errors = run_dram_tuning_threads(node, num_lmcs, bytemask);
+
+ for (lmc = 0; lmc < num_lmcs; lmc++) {
+ // record stop cycle CSRs here for utilization measure
+ stop_dram_dclk[lmc] = BDK_CSR_READ(node, BDK_LMCX_DCLK_CNT(lmc));
+ stop_dram_ops[lmc] = BDK_CSR_READ(node, BDK_LMCX_OPS_CNT(lmc));
+
+ // accumulate...
+ ops_sum[lmc] += stop_dram_ops[lmc] - start_dram_ops[lmc];
+ dclk_sum[lmc] += stop_dram_dclk[lmc] - start_dram_dclk[lmc];
+
+ errors = test_dram_byte_lmc_errs[lmc];
+
+ // check errors by byte, but not ECC
+ for (byte = 0; byte < 8; ++byte) {
+ if (!(bytes_todo & (1 << byte))) // is this byte lane to be done
+ continue; // no
+
+ byte_delay_windows[lmc][byte] <<= 1; // always put in a zero
+ if (errors & (1 << byte)) { // yes, an error in this byte lane
+ byte_delay_count[lmc][byte] = 0; // stop now always
+ } else { // no error in this byte lane
+ if (byte_delay_count[lmc][byte] == 0) { // first success, set run start
+ byte_delay_start[lmc][byte] = byte_offset;
+ }
+ byte_delay_count[lmc][byte] += incr_offset; // bump run length
+
+ if (byte_delay_count[lmc][byte] > byte_delay_best_count[lmc][byte]) {
+ byte_delay_best_count[lmc][byte] = byte_delay_count[lmc][byte];
+ byte_delay_best_start[lmc][byte] = byte_delay_start[lmc][byte];
+ }
+ byte_delay_windows[lmc][byte] |= 1ULL; // for pass, put in a 1
+ }
+ } /* for (byte = 0; byte < 8; ++byte) */
+
+ // only print when there are errors and verbose...
+ if (errors) {
+ debug_print("DLL %s Offset Test %3d: errors 0x%x\n",
+ mode_str, byte_offset, errors);
+ }
+ } /* for (lmc = 0; lmc < num_lmcs; lmc++) */
+
+ } /* for (byte_offset=-63; byte_offset<63; byte_offset += incr_offset) */
+
+ // done with testing, load up and/or print out the offsets we found...
+
+ // only when margining...
+ if (!do_tune) {
+ printf(" \n");
+ printf("-------------------------------------\n");
+#if 0
+ uint32_t mts_speed = (libdram_get_freq_from_pll(node, 0) * 2) / 1000000; // FIXME: sample LMC0
+ printf("N%d: Starting %s Timing Margining for %d MT/s.\n", node, mode_str, mts_speed);
+#else
+ printf("N%d: Starting %s Timing Margining.\n", node, mode_str);
+#endif
+ printf(" \n");
+ } /* if (!do_tune) */
+
+ for (lmc = 0; lmc < num_lmcs; lmc++) {
+#if 1
+ // FIXME FIXME
+ // FIXME: this just makes ECC always show 0
+ byte_delay_best_start[lmc][8] = start_offset;
+ byte_delay_best_count[lmc][8] = end_offset - start_offset + incr_offset;
+#endif
+
+ // disable offsets while we load...
+ change_dll_offset_enable(node, lmc, 0);
+
+ // only when margining...
+ if (!do_tune) {
+ // print the heading
+ printf(" \n");
+ printf("N%d.LMC%d: %s Timing Margin %s : ", node, lmc, mode_str, mode_blk);
+ printf(" ECC/8 ");
+ for (byte = 7; byte >= 0; byte--) {
+ printf(" Byte %d ", byte);
+ }
+ printf("\n");
+ } /* if (!do_tune) */
+
+ // print and load the offset values
+ // print the windows bit arrays
+ // only when margining...
+ if (!do_tune) {
+ printf("N%d.LMC%d: DLL %s Offset Amount %s : ", node, lmc, mode_str, mode_blk);
+ } else {
+ ddr_print("N%d.LMC%d: SW DLL %s Offset Amount %s : ", node, lmc, mode_str, mode_blk);
+ }
+ for (byte = 8; byte >= 0; --byte) { // print in "normal" reverse index order
+
+ int count = byte_delay_best_count[lmc][byte];
+ if (count == 0)
+ count = incr_offset; // should make non-tested ECC byte come out 0
+
+ byte_offset = byte_delay_best_start[lmc][byte] +
+ ((count - incr_offset) / 2); // adj by incr
+
+ if (!do_tune) { // do counting and special flag if margining
+ int will_need_review = !is_low_risk_winlen(speed_bin, (count - incr_offset)) &&
+ !is_low_risk_offset(speed_bin, byte_offset);
+
+ printf("%10d%c", byte_offset, (will_need_review) ? '<' :' ');
+
+ if (will_need_review)
+ needs_review_count++;
+ else
+ low_risk_count++;
+ } else { // if just tuning, make the printout less lengthy
+ ddr_print("%5d ", byte_offset);
+ }
+
+ // FIXME? should we be able to override this?
+ if (mode_is_read) // for READ offsets, always store what we found
+ load_dll_offset(node, lmc, dll_offset_mode, byte_offset, byte);
+ else // for WRITE offsets, always store 0
+ load_dll_offset(node, lmc, dll_offset_mode, 0, byte);
+
+ }
+ if (!do_tune) {
+ printf("\n");
+ } else {
+ ddr_print("\n");
+ }
+
+
+ // re-enable the offsets now that we are done loading
+ change_dll_offset_enable(node, lmc, 1);
+
+ // only when margining...
+ if (!do_tune) {
+ // print the window sizes
+ printf("N%d.LMC%d: DLL %s Window Length %s : ", node, lmc, mode_str, mode_blk);
+ for (byte = 8; byte >= 0; --byte) { // print in "normal" reverse index order
+ int count = byte_delay_best_count[lmc][byte];
+ if (count == 0)
+ count = incr_offset; // should make non-tested ECC byte come out 0
+
+ // do this again since the "needs review" test is an AND...
+ byte_offset = byte_delay_best_start[lmc][byte] +
+ ((count - incr_offset) / 2); // adj by incr
+
+ int will_need_review = !is_low_risk_winlen(speed_bin, (count - incr_offset)) &&
+ !is_low_risk_offset(speed_bin, byte_offset);
+
+ printf("%10d%c", count - incr_offset, (will_need_review) ? '<' :' ');
+ }
+ printf("\n");
+
+ // print the window extents
+ printf("N%d.LMC%d: DLL %s Window Bounds %s : ", node, lmc, mode_str, mode_blk);
+ for (byte = 8; byte >= 0; --byte) { // print in "normal" reverse index order
+ int start = byte_delay_best_start[lmc][byte];
+ int count = byte_delay_best_count[lmc][byte];
+ if (count == 0)
+ count = incr_offset; // should make non-tested ECC byte come out 0
+ printf(" %3d to%3d ", start,
+ start + count - incr_offset);
+ }
+ printf("\n");
+#if 0
+ // FIXME: should have a way to force these out...
+ // print the windows bit arrays
+ printf("N%d.LMC%d: DLL %s Window Bitmap%s : ", node, lmc, mode_str, mode_blk);
+ for (byte = 8; byte >= 0; --byte) { // print in "normal" reverse index order
+ printf("%010lx ", byte_delay_windows[lmc][byte]);
+ }
+ printf("\n");
+#endif
+ } /* if (!do_tune) */
+ } /* for (lmc = 0; lmc < num_lmcs; lmc++) */
+
+ // only when margining...
+ if (!do_tune) {
+ // print the Summary line(s) here
+ printf(" \n");
+ printf("N%d: %s Timing Margining Summary : %s ", node, mode_str,
+ (needs_review_count > 0) ? "Needs Review" : "Low Risk");
+ if (needs_review_count > 0)
+ printf("(%d)", needs_review_count);
+ printf("\n");
+
+ // FIXME??? want to print here: "N0: %s Offsets have been applied already"
+
+ printf("-------------------------------------\n");
+ printf(" \n");
+ } /* if (!do_tune) */
+
+ // FIXME: we probably want this only when doing verbose...
+ // finally, print the utilizations all together
+ for (lmc = 0; lmc < num_lmcs; lmc++) {
+ uint64_t percent_x10 = ops_sum[lmc] * 1000 / dclk_sum[lmc];
+ ddr_print2("N%d.LMC%d: ops %lu, cycles %lu, used %lu.%lu%%\n",
+ node, lmc, ops_sum[lmc], dclk_sum[lmc], percent_x10 / 10, percent_x10 % 10);
+ } /* for (lmc = 0; lmc < num_lmcs; lmc++) */
+
+ // FIXME: only when verbose, or only when there are errors?
+ // run the test one last time
+ // print whether there are errors or not, but only when verbose...
+ bdk_watchdog_poke();
+ debug_print("N%d: %s: Start running test one last time\n", node, __FUNCTION__);
+ tot_errors = run_dram_tuning_threads(node, num_lmcs, bytemask);
+ debug_print("N%d: %s: Finished running test one last time\n", node, __FUNCTION__);
+ if (tot_errors)
+ ddr_print2("%s Timing Final Test: errors 0x%x\n", mode_str, tot_errors);
+
+ return (do_tune) ? tot_errors : !!(needs_review_count > 0);
+}
+
+#define USE_L2_WAYS_LIMIT 0 // non-zero to enable L2 ways limiting
+
+/*
+ * Automatically adjust the DLL offset for the data bytes
+ */
+int perform_dll_offset_tuning(bdk_node_t node, int dll_offset_mode, int do_tune)
+{
+ int ddr_interface_64b;
+ int save_ecc_ena[4];
+ bdk_lmcx_config_t lmc_config;
+ int lmc, num_lmcs = __bdk_dram_get_num_lmc(node);
+ const char *s;
+#if USE_L2_WAYS_LIMIT
+ int ways, ways_print = 0;
+#endif
+#if 0
+ int dram_tune_use_rodt = -1, save_rodt[4];
+ bdk_lmcx_comp_ctl2_t comp_ctl2;
+#endif
+ int loops = 1, loop;
+ uint64_t orig_coremask;
+ int errs = 0;
+
+ // enable any non-running cores on this node
+ orig_coremask = bdk_get_running_coremask(node);
+ ddr_print4("N%d: %s: Starting cores (mask was 0x%lx)\n",
+ node, __FUNCTION__, orig_coremask);
+ bdk_init_cores(node, ~0ULL & ~orig_coremask);
+ dram_tune_max_cores = bdk_get_num_running_cores(node);
+
+ // but use only a certain number of cores, at most what is available
+ if ((s = getenv("ddr_tune_use_cores")) != NULL) {
+ dram_tune_use_cores = strtoul(s, NULL, 0);
+ if (dram_tune_use_cores <= 0) // allow 0 or negative to mean all
+ dram_tune_use_cores = dram_tune_max_cores;
+ }
+ if (dram_tune_use_cores > dram_tune_max_cores)
+ dram_tune_use_cores = dram_tune_max_cores;
+
+ // see if we want to do the tuning more than once per LMC...
+ if ((s = getenv("ddr_tune_use_loops"))) {
+ loops = strtoul(s, NULL, 0);
+ }
+
+ // see if we want to change the granularity of the byte_offset sampling
+ if ((s = getenv("ddr_tune_use_gran"))) {
+ dram_tune_use_gran = strtoul(s, NULL, 0);
+ }
+
+ // allow override of the test repeats (bursts) per thread create
+ if ((s = getenv("ddr_tune_use_bursts")) != NULL) {
+ dram_tune_use_bursts = strtoul(s, NULL, 10);
+ }
+
+#if 0
+ // allow override of Read ODT setting just during the tuning run(s)
+ if ((s = getenv("ddr_tune_use_rodt")) != NULL) {
+ int temp = strtoul(s, NULL, 10);
+ // validity check
+ if (temp >= 0 && temp <= 7)
+ dram_tune_use_rodt = temp;
+ }
+#endif
+
+#if 0
+ // allow override of the test pattern
+ // FIXME: a bit simplistic...
+ if ((s = getenv("ddr_tune_use_pattern")) != NULL) {
+ int patno = strtoul(s, NULL, 10);
+ if (patno == 2)
+ dram_tune_test_pattern = test_pattern_2;
+ else if (patno == 3)
+ dram_tune_test_pattern = test_pattern_3;
+ else // all other values use default
+ dram_tune_test_pattern = test_pattern_1;
+ }
+#endif
+
+ // allow override of the test mem_xor algorithm
+ if ((s = getenv("ddr_tune_use_xor2")) != NULL) {
+ dram_tune_use_xor2 = !!strtoul(s, NULL, 10);
+ }
+
+ // print current working values
+ ddr_print2("N%d: Tuning will use %d cores of max %d cores, and use %d repeats.\n",
+ node, dram_tune_use_cores, dram_tune_max_cores,
+ dram_tune_use_bursts);
+
+#if USE_L2_WAYS_LIMIT
+ // see if L2 ways are limited
+ if ((s = lookup_env_parameter("limit_l2_ways")) != NULL) {
+ ways = strtoul(s, NULL, 10);
+ ways_print = 1;
+ } else {
+ ways = bdk_l2c_get_num_assoc(node);
+ }
+#endif
+
+#if 0
+ // if RODT is to be overridden during tuning, note change
+ if (dram_tune_use_rodt >= 0) {
+ ddr_print("N%d: using RODT %d for tuning.\n",
+ node, dram_tune_use_rodt);
+ }
+#endif
+
+ // FIXME? get flag from LMC0 only
+ lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(0));
+ ddr_interface_64b = !lmc_config.s.mode32b;
+
+ // do setup for each active LMC
+ debug_print("N%d: %s: starting LMCs setup.\n", node, __FUNCTION__);
+ for (lmc = 0; lmc < num_lmcs; lmc++) {
+
+#if 0
+ // if RODT change, save old and set new here...
+ if (dram_tune_use_rodt >= 0) {
+ comp_ctl2.u = BDK_CSR_READ(node, BDK_LMCX_COMP_CTL2(lmc));
+ save_rodt[lmc] = comp_ctl2.s.rodt_ctl;
+ comp_ctl2.s.rodt_ctl = dram_tune_use_rodt;
+ DRAM_CSR_WRITE(node, BDK_LMCX_COMP_CTL2(lmc), comp_ctl2.u);
+ BDK_CSR_READ(node, BDK_LMCX_COMP_CTL2(lmc));
+ }
+#endif
+ /* Disable ECC for DRAM tests */
+ lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
+ save_ecc_ena[lmc] = lmc_config.s.ecc_ena;
+ lmc_config.s.ecc_ena = 0;
+ DRAM_CSR_WRITE(node, BDK_LMCX_CONFIG(lmc), lmc_config.u);
+ lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
+
+ } /* for (lmc = 0; lmc < num_lmcs; lmc++) */
+
+#if USE_L2_WAYS_LIMIT
+ /* Disable l2 sets for DRAM testing */
+ limit_l2_ways(node, 0, ways_print);
+#endif
+
+ // testing is done on all LMCs simultaneously
+ // FIXME: for now, loop here to show what happens multiple times
+ for (loop = 0; loop < loops; loop++) {
+ /* Perform DLL offset tuning */
+ errs = auto_set_dll_offset(node, dll_offset_mode, num_lmcs, ddr_interface_64b, do_tune);
+ }
+
+#if USE_L2_WAYS_LIMIT
+ /* Restore the l2 set configuration */
+ limit_l2_ways(node, ways, ways_print);
+#endif
+
+ // perform cleanup on all active LMCs
+ debug_print("N%d: %s: starting LMCs cleanup.\n", node, __FUNCTION__);
+ for (lmc = 0; lmc < num_lmcs; lmc++) {
+
+ /* Restore ECC for DRAM tests */
+ lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
+ lmc_config.s.ecc_ena = save_ecc_ena[lmc];
+ DRAM_CSR_WRITE(node, BDK_LMCX_CONFIG(lmc), lmc_config.u);
+ lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
+#if 0
+ // if RODT change, restore old here...
+ if (dram_tune_use_rodt >= 0) {
+ comp_ctl2.u = BDK_CSR_READ(node, BDK_LMCX_COMP_CTL2(lmc));
+ comp_ctl2.s.rodt_ctl = save_rodt[lmc];
+ DRAM_CSR_WRITE(node, BDK_LMCX_COMP_CTL2(lmc), comp_ctl2.u);
+ BDK_CSR_READ(node, BDK_LMCX_COMP_CTL2(lmc));
+ }
+#endif
+ // finally, see if there are any read offset overrides after tuning
+ // FIXME: provide a way to do write offsets also??
+ if (dll_offset_mode == 2) {
+ for (int by = 0; by < 9; by++) {
+ if ((s = lookup_env_parameter("ddr%d_tune_byte%d", lmc, by)) != NULL) {
+ int dllro = strtoul(s, NULL, 10);
+ change_dll_offset_enable(node, lmc, 0);
+ load_dll_offset(node, lmc, /* read */2, dllro, by);
+ change_dll_offset_enable(node, lmc, 1);
+ }
+ }
+ }
+ } /* for (lmc = 0; lmc < num_lmcs; lmc++) */
+
+ // finish up...
+
+#if 0
+ // if RODT was overridden during tuning, note restore
+ if (dram_tune_use_rodt >= 0) {
+ ddr_print("N%d: restoring RODT %d after tuning.\n",
+ node, save_rodt[0]); // FIXME? use LMC0
+ }
+#endif
+
+ // put any cores on this node, that were not running at the start, back into reset
+ uint64_t reset_coremask = bdk_get_running_coremask(node) & ~orig_coremask;
+ if (reset_coremask) {
+ ddr_print4("N%d: %s: Stopping cores 0x%lx\n", node, __FUNCTION__,
+ reset_coremask);
+ bdk_reset_cores(node, reset_coremask);
+ } else {
+ ddr_print4("N%d: %s: leaving cores set to 0x%lx\n", node, __FUNCTION__,
+ orig_coremask);
+ }
+
+ return errs;
+
+} /* perform_dll_offset_tuning */
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+
+///// HW-assist byte DLL offset tuning //////
+
+#if 1
+// setup defaults for byte test pattern array
+// take these first two from the HRM section 6.9.13
+static const uint64_t byte_pattern_0[] = {
+ 0xFFAAFFFFFF55FFFFULL, // GP0
+ 0x55555555AAAAAAAAULL, // GP1
+ 0xAA55AAAAULL, // GP2
+};
+static const uint64_t byte_pattern_1[] = {
+ 0xFBF7EFDFBF7FFEFDULL, // GP0
+ 0x0F1E3C78F0E1C387ULL, // GP1
+ 0xF0E1BF7FULL, // GP2
+};
+// this is from Andrew via LFSR with PRBS=0xFFFFAAAA
+static const uint64_t byte_pattern_2[] = {
+ 0xEE55AADDEE55AADDULL, // GP0
+ 0x55AADDEE55AADDEEULL, // GP1
+ 0x55EEULL, // GP2
+};
+// this is from Mike via LFSR with PRBS=0x4A519909
+static const uint64_t byte_pattern_3[] = {
+ 0x0088CCEE0088CCEEULL, // GP0
+ 0xBB552211BB552211ULL, // GP1
+ 0xBB00ULL, // GP2
+};
+
+static const uint64_t *byte_patterns[] = {
+ byte_pattern_0, byte_pattern_1, byte_pattern_2, byte_pattern_3 // FIXME: use all we have
+};
+#define NUM_BYTE_PATTERNS ((int)(sizeof(byte_patterns)/sizeof(uint64_t *)))
+
+#define DEFAULT_BYTE_BURSTS 32 // FIXME: this is what the longest test usually has
+int dram_tune_byte_bursts = DEFAULT_BYTE_BURSTS;
+#endif
+
+static void
+setup_hw_pattern(bdk_node_t node, int lmc, const uint64_t *pattern_p)
+{
+ /*
+ 3) Setup GENERAL_PURPOSE[0-2] registers with the data pattern of choice.
+ a. GENERAL_PURPOSE0[DATA<63:0>] – sets the initial lower (rising edge) 64 bits of data.
+ b. GENERAL_PURPOSE1[DATA<63:0>] – sets the initial upper (falling edge) 64 bits of data.
+ c. GENERAL_PURPOSE2[DATA<15:0>] – sets the initial lower (rising edge <7:0>) and upper
+ (falling edge <15:8>) ECC data.
+ */
+ DRAM_CSR_WRITE(node, BDK_LMCX_GENERAL_PURPOSE0(lmc), pattern_p[0]);
+ DRAM_CSR_WRITE(node, BDK_LMCX_GENERAL_PURPOSE1(lmc), pattern_p[1]);
+ DRAM_CSR_WRITE(node, BDK_LMCX_GENERAL_PURPOSE2(lmc), pattern_p[2]);
+}
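+
+// For example, setup_hw_pattern(node, lmc, byte_pattern_0) loads
+// GP0=0xFFAAFFFFFF55FFFF, GP1=0x55555555AAAAAAAA, GP2=0xAA55AAAA,
+// i.e. the HRM section 6.9.13 pattern declared above.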
+
+#define DEFAULT_PRBS 0xFFFFAAAAUL /* FIXME: maybe try 0x4A519909UL */
+
+static void
+setup_lfsr_pattern(bdk_node_t node, int lmc, uint64_t data)
+{
+ uint32_t prbs;
+ const char *s;
+
+ if ((s = getenv("ddr_lfsr_prbs"))) {
+ prbs = strtoul(s, NULL, 0);
+ } else
+ prbs = DEFAULT_PRBS; // FIXME: from data arg?
+
+ /*
+ 2) DBTRAIN_CTL[LFSR_PATTERN_SEL] = 1
+ here data comes from the LFSR generating a PRBS pattern
+ CHAR_CTL.EN = 0
+ CHAR_CTL.SEL = 0; // for PRBS
+ CHAR_CTL.DR = 1;
+ CHAR_CTL.PRBS = setup for whatever type of PRBS to send
+ CHAR_CTL.SKEW_ON = 1;
+ */
+ BDK_CSR_INIT(char_ctl, node, BDK_LMCX_CHAR_CTL(lmc));
+ char_ctl.s.en = 0;
+ char_ctl.s.sel = 0;
+ char_ctl.s.dr = 1;
+ char_ctl.s.prbs = prbs;
+ char_ctl.s.skew_on = 1;
+ DRAM_CSR_WRITE(node, BDK_LMCX_CHAR_CTL(lmc), char_ctl.u);
+}
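+
+// Note: the two PRBS values used here are the same seeds the canned
+// byte_pattern_2/byte_pattern_3 arrays above were generated from
+// (0xFFFFAAAA and 0x4A519909), presumably so LFSR mode and the
+// GENERAL_PURPOSE patterns exercise comparable data.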
+
+int
+choose_best_hw_patterns(bdk_node_t node, int lmc, int mode)
+{
+ int new_mode = mode;
+ const char *s;
+
+ switch (mode) {
+ case DBTRAIN_TEST: // always choose LFSR if chip supports it
+ if (! CAVIUM_IS_MODEL(CAVIUM_CN88XX)) {
+ int lfsr_enable = 1;
+ if ((s = getenv("ddr_allow_lfsr"))) { // override?
+ lfsr_enable = !!strtoul(s, NULL, 0);
+ }
+ if (lfsr_enable)
+ new_mode = DBTRAIN_LFSR;
+ }
+ break;
+ case DBTRAIN_DBI: // possibly can allow LFSR use?
+ break;
+ case DBTRAIN_LFSR: // forced already
+ if (CAVIUM_IS_MODEL(CAVIUM_CN88XX)) {
+ ddr_print("ERROR: illegal HW assist mode %d\n", mode);
+ new_mode = DBTRAIN_TEST;
+ }
+ break;
+ default:
+ ddr_print("ERROR: unknown HW assist mode %d\n", mode);
+ }
+
+ if (new_mode != mode)
+ VB_PRT(VBL_DEV2, "choose_best_hw_patterns: changing mode %d to %d\n", mode, new_mode);
+
+ return new_mode;
+}
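+// NOTE: the logic above reflects that CN88XX apparently lacks the LFSR pattern
+// generator: DBTRAIN_TEST is promoted to DBTRAIN_LFSR only on other models, and
+// a forced DBTRAIN_LFSR is demoted back to DBTRAIN_TEST on CN88XX.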
+
+int
+run_best_hw_patterns(bdk_node_t node, int lmc, uint64_t phys_addr,
+ int mode, uint64_t *xor_data)
+{
+ int pattern;
+ const uint64_t *pattern_p;
+ int errs, errors = 0;
+
+ // FIXME? always choose LFSR if chip supports it???
+ mode = choose_best_hw_patterns(node, lmc, mode);
+
+ if (mode == DBTRAIN_LFSR) {
+ setup_lfsr_pattern(node, lmc, 0);
+ errors = test_dram_byte_hw(node, lmc, phys_addr, mode, xor_data);
+ VB_PRT(VBL_DEV2, "%s: LFSR at A:0x%012lx errors 0x%x\n",
+ __FUNCTION__, phys_addr, errors);
+ } else {
+ for (pattern = 0; pattern < NUM_BYTE_PATTERNS; pattern++) {
+ pattern_p = byte_patterns[pattern];
+ setup_hw_pattern(node, lmc, pattern_p);
+
+ errs = test_dram_byte_hw(node, lmc, phys_addr, mode, xor_data);
+
+ VB_PRT(VBL_DEV2, "%s: PATTERN %d at A:0x%012lx errors 0x%x\n",
+ __FUNCTION__, pattern, phys_addr, errs);
+
+ errors |= errs;
+ } /* for (pattern = 0; pattern < NUM_BYTE_PATTERNS; pattern++) */
+ }
+ return errors;
+}
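+// NOTE: the return value ORs together the per-pattern results; each result
+// appears to be a bitmask of the erroring bytelanes (see the NOTE in
+// hw_assist_test_dll_offset() below).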
+
+static void
+hw_assist_test_dll_offset(bdk_node_t node, int dll_offset_mode,
+ int lmc, int bytelane)
+{
+ int byte_offset, new_best_offset[9];
+ int rank_delay_start[4][9];
+ int rank_delay_count[4][9];
+ int rank_delay_best_start[4][9];
+ int rank_delay_best_count[4][9];
+ int errors[4], off_errors, tot_errors;
+ int num_lmcs = __bdk_dram_get_num_lmc(node);
+ int rank_mask, rankx, active_ranks;
+ int pattern;
+ const uint64_t *pattern_p;
+ int byte;
+    const char *mode_str = (dll_offset_mode == 2) ? "Read" : "Write";
+ int pat_best_offset[9];
+ uint64_t phys_addr;
+ int pat_beg, pat_end;
+ int rank_beg, rank_end;
+ int byte_lo, byte_hi;
+ uint64_t hw_rank_offset;
+ // FIXME? always choose LFSR if chip supports it???
+ int mode = choose_best_hw_patterns(node, lmc, DBTRAIN_TEST);
+
+ if (bytelane == 0x0A) { // all bytelanes
+ byte_lo = 0;
+ byte_hi = 8;
+ } else { // just 1
+ byte_lo = byte_hi = bytelane;
+ }
+
+ BDK_CSR_INIT(lmcx_config, node, BDK_LMCX_CONFIG(lmc));
+ rank_mask = lmcx_config.s.init_status;
+ // this should be correct for 1 or 2 ranks, 1 or 2 DIMMs
+ hw_rank_offset = 1ull << (28 + lmcx_config.s.pbank_lsb - lmcx_config.s.rank_ena + (num_lmcs/2));
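+    // Illustrative example (hypothetical values): with pbank_lsb = 5,
+    // rank_ena = 1, and num_lmcs = 4, the shift is 28 + 5 - 1 + 2 = 34,
+    // placing consecutive ranks 16 GB apart in physical address space.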
+
+ debug_print("N%d: %s: starting LMC%d with rank offset 0x%lx\n",
+ node, __FUNCTION__, lmc, hw_rank_offset);
+
+ // start of pattern loop
+ // we do the set of tests for each pattern supplied...
+
+ memset(new_best_offset, 0, sizeof(new_best_offset));
+ for (pattern = 0; pattern < NUM_BYTE_PATTERNS; pattern++) {
+
+ memset(pat_best_offset, 0, sizeof(pat_best_offset));
+
+ if (mode == DBTRAIN_TEST) {
+ pattern_p = byte_patterns[pattern];
+ setup_hw_pattern(node, lmc, pattern_p);
+ } else {
+ setup_lfsr_pattern(node, lmc, 0);
+ }
+
+ // now loop through all legal values for the DLL byte offset...
+
+#define BYTE_OFFSET_INCR 3 // FIXME: make this tunable?
+
+ tot_errors = 0;
+
+ memset(rank_delay_count, 0, sizeof(rank_delay_count));
+ memset(rank_delay_start, 0, sizeof(rank_delay_start));
+ memset(rank_delay_best_count, 0, sizeof(rank_delay_best_count));
+ memset(rank_delay_best_start, 0, sizeof(rank_delay_best_start));
+
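+        // the sweep below covers byte offsets -63..+63 in steps of
+        // BYTE_OFFSET_INCR, i.e. 43 test points per pattern at the
+        // default increment of 3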
+ for (byte_offset = -63; byte_offset < 64; byte_offset += BYTE_OFFSET_INCR) {
+
+ // do the setup on the active LMC
+ // set the bytelanes DLL offsets
+ change_dll_offset_enable(node, lmc, 0);
+ load_dll_offset(node, lmc, dll_offset_mode, byte_offset, bytelane); // FIXME? bytelane?
+ change_dll_offset_enable(node, lmc, 1);
+
+ bdk_watchdog_poke();
+
+ // run the test on each rank
+            // only 1 call per rank should be enough; let the bursts, loops, etc., control the load...
+
+ off_errors = 0; // errors for this byte_offset, all ranks
+
+ active_ranks = 0;
+
+ for (rankx = 0; rankx < 4; rankx++) {
+ if (!(rank_mask & (1 << rankx)))
+ continue;
+
+ phys_addr = hw_rank_offset * active_ranks;
+ // FIXME: now done by test_dram_byte_hw()
+ //phys_addr |= (lmc << 7);
+ //phys_addr = bdk_numa_get_address(node, phys_addr); // map to node
+
+ active_ranks++;
+
+                // NOTE: the return value is now a bitmask of the erroring bytelanes...
+ errors[rankx] = test_dram_byte_hw(node, lmc, phys_addr, mode, NULL);
+
+ for (byte = byte_lo; byte <= byte_hi; byte++) { // do bytelane(s)
+
+ // check errors
+ if (errors[rankx] & (1 << byte)) { // yes, an error in the byte lane in this rank
+ off_errors |= (1 << byte);
+
+ ddr_print5("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test %3d: Address 0x%012lx errors 0x%x\n",
+ node, lmc, rankx, bytelane, mode_str,
+ byte_offset, phys_addr, errors[rankx]);
+
+ if (rank_delay_count[rankx][byte] > 0) { // had started run
+ ddr_print5("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test %3d: stopping a run here\n",
+ node, lmc, rankx, bytelane, mode_str, byte_offset);
+ rank_delay_count[rankx][byte] = 0; // stop now
+ }
+ // FIXME: else had not started run - nothing else to do?
+ } else { // no error in the byte lane
+ if (rank_delay_count[rankx][byte] == 0) { // first success, set run start
+ ddr_print5("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test %3d: starting a run here\n",
+ node, lmc, rankx, bytelane, mode_str, byte_offset);
+ rank_delay_start[rankx][byte] = byte_offset;
+ }
+ rank_delay_count[rankx][byte] += BYTE_OFFSET_INCR; // bump run length
+
+ // is this now the biggest window?
+ if (rank_delay_count[rankx][byte] > rank_delay_best_count[rankx][byte]) {
+ rank_delay_best_count[rankx][byte] = rank_delay_count[rankx][byte];
+ rank_delay_best_start[rankx][byte] = rank_delay_start[rankx][byte];
+ debug_print("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test %3d: updating best to %d/%d\n",
+ node, lmc, rankx, bytelane, mode_str, byte_offset,
+ rank_delay_best_start[rankx][byte], rank_delay_best_count[rankx][byte]);
+ }
+ }
+ } /* for (byte = byte_lo; byte <= byte_hi; byte++) */
+ } /* for (rankx = 0; rankx < 4; rankx++) */
+
+ tot_errors |= off_errors;
+
+ } /* for (byte_offset = -63; byte_offset < 64; byte_offset += BYTE_OFFSET_INCR) */
+
+        // now choose the best byte_offsets for this pattern according to the best windows of the tested ranks
+        // calculate each offset as the midpoint of the window common to all tested ranks
+ for (byte = byte_lo; byte <= byte_hi; byte++) {
+
+ pat_beg = -999;
+ pat_end = 999;
+
+ for (rankx = 0; rankx < 4; rankx++) {
+ if (!(rank_mask & (1 << rankx)))
+ continue;
+
+ rank_beg = rank_delay_best_start[rankx][byte];
+ pat_beg = max(pat_beg, rank_beg);
+ rank_end = rank_beg + rank_delay_best_count[rankx][byte] - BYTE_OFFSET_INCR;
+ pat_end = min(pat_end, rank_end);
+
+ ddr_print5("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test: Rank Window %3d:%3d\n",
+ node, lmc, rankx, bytelane, mode_str, rank_beg, rank_end);
+
+ } /* for (rankx = 0; rankx < 4; rankx++) */
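+            // pat_beg/pat_end now bound the window common to all tested ranks;
+            // hypothetical example: rank windows [-30,12] and [-18,24]
+            // intersect to [-18,12], giving a midpoint offset of -3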
+
+ pat_best_offset[byte] = (pat_end + pat_beg) / 2;
+ ddr_print4("N%d.LMC%d: Bytelane %d DLL %s Offset Test: Pattern %d Average %3d\n",
+ node, lmc, byte, mode_str, pattern, pat_best_offset[byte]);
+
+#if 0
+            // FIXME: next print the per-rank best window counts for this byte
+            {
+                char sbuffer[64];
+                sprintf(sbuffer, "N%d.LMC%d Pattern %d: DLL %s Offset Count ",
+                        node, lmc, pattern, mode_str);
+                printf("%-45s : ", sbuffer);
+                for (rankx = 0; rankx < 4; rankx++) {
+                    if (!(rank_mask & (1 << rankx)))
+                        continue;
+                    printf(" %3d", rank_delay_best_count[rankx][byte]);
+                }
+                printf("\n");
+            }
+#endif
+
+ new_best_offset[byte] += pat_best_offset[byte]; // sum the pattern averages
+ } /* for (byte = byte_lo; byte <= byte_hi; byte++) */
+ } /* for (pattern = 0; pattern < NUM_BYTE_PATTERNS; pattern++) */
+ // end of pattern loop
+
+ ddr_print("N%d.LMC%d: HW DLL %s Offset Amount : ",
+ node, lmc, mode_str);
+
+    for (byte = byte_hi; byte >= byte_lo; --byte) { // print in descending byte index order
+ new_best_offset[byte] = divide_nint(new_best_offset[byte], NUM_BYTE_PATTERNS); // create the new average NINT
+
+ // print the best offsets from all patterns
+
+        if (bytelane == 0x0A) // print just the offsets of all the bytes
+ ddr_print("%5d ", new_best_offset[byte]);
+ else
+ ddr_print("(byte %d) %5d ", byte, new_best_offset[byte]);
+
+
+#if 1
+ // done with testing, load up the best offsets we found...
+ change_dll_offset_enable(node, lmc, 0); // disable offsets while we load...
+ load_dll_offset(node, lmc, dll_offset_mode, new_best_offset[byte], byte);
+ change_dll_offset_enable(node, lmc, 1); // re-enable the offsets now that we are done loading
+#endif
+ } /* for (byte = byte_hi; byte >= byte_lo; --byte) */
+
+ ddr_print("\n");
+
+#if 0
+ // run the test one last time
+ // print whether there are errors or not, but only when verbose...
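+    // FIXME: 'bytemask' is not defined in this function; it would need to be
+    // derived from 'bytelane' (as in the earlier SW-loop tuning path) before
+    // this block could be enabled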
+ tot_errors = run_test_dram_byte_threads(node, num_lmcs, bytemask);
+ printf("N%d.LMC%d: Bytelane %d DLL %s Offset Final Test: errors 0x%x\n",
+ node, lmc, bytelane, mode_str, tot_errors);
+#endif
+}
+
+/*
+ * Automatically adjust the DLL offset for the selected bytelane using hardware-assist
+ */
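+// NOTE: the dll_offset_mode argument is effectively ignored here; only the
+// read-side (mode 2) tuning is invoked below, and the corresponding
+// write-side call is commented out.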
+int perform_HW_dll_offset_tuning(bdk_node_t node, int dll_offset_mode, int bytelane)
+{
+ int save_ecc_ena[4];
+ bdk_lmcx_config_t lmc_config;
+ int lmc, num_lmcs = __bdk_dram_get_num_lmc(node);
+ const char *s;
+ //bdk_lmcx_comp_ctl2_t comp_ctl2;
+ int loops = 1, loop;
+
+ // see if we want to do the tuning more than once per LMC...
+ if ((s = getenv("ddr_tune_ecc_loops"))) {
+ loops = strtoul(s, NULL, 0);
+ }
+
+ // allow override of the test repeats (bursts)
+ if ((s = getenv("ddr_tune_byte_bursts")) != NULL) {
+ dram_tune_byte_bursts = strtoul(s, NULL, 10);
+ }
+
+ // print current working values
+ ddr_print2("N%d: H/W Tuning for bytelane %d will use %d loops, %d bursts, and %d patterns.\n",
+ node, bytelane, loops, dram_tune_byte_bursts,
+ NUM_BYTE_PATTERNS);
+
+ // FIXME? get flag from LMC0 only
+ lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(0));
+
+ // do once for each active LMC
+
+ for (lmc = 0; lmc < num_lmcs; lmc++) {
+
+ ddr_print4("N%d: H/W Tuning: starting LMC%d bytelane %d tune.\n", node, lmc, bytelane);
+
+ /* Enable ECC for the HW tests */
+ // NOTE: we do enable ECC, but the HW tests used will not generate "visible" errors
+ lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
+ save_ecc_ena[lmc] = lmc_config.s.ecc_ena;
+ lmc_config.s.ecc_ena = 1;
+ DRAM_CSR_WRITE(node, BDK_LMCX_CONFIG(lmc), lmc_config.u);
+ lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
+
+ // testing is done on a single LMC at a time
+ // FIXME: for now, loop here to show what happens multiple times
+ for (loop = 0; loop < loops; loop++) {
+ /* Perform DLL offset tuning */
+ //auto_set_dll_offset(node, 1 /* 1=write */, lmc, bytelane);
+ hw_assist_test_dll_offset(node, 2 /* 2=read */, lmc, bytelane);
+ }
+
+ // perform cleanup on active LMC
+ ddr_print4("N%d: H/W Tuning: finishing LMC%d bytelane %d tune.\n", node, lmc, bytelane);
+
+ /* Restore ECC for DRAM tests */
+ lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
+ lmc_config.s.ecc_ena = save_ecc_ena[lmc];
+ DRAM_CSR_WRITE(node, BDK_LMCX_CONFIG(lmc), lmc_config.u);
+ lmc_config.u = BDK_CSR_READ(node, BDK_LMCX_CONFIG(lmc));
+
+ // finally, see if there are any read offset overrides after tuning
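+        // (e.g., an environment setting "ddr0_tune_byte3" would force LMC0
+        // byte 3's read offset to the given decimal value)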
+ for (int by = 0; by < 9; by++) {
+ if ((s = lookup_env_parameter("ddr%d_tune_byte%d", lmc, by)) != NULL) {
+ int dllro = strtoul(s, NULL, 10);
+ change_dll_offset_enable(node, lmc, 0);
+ load_dll_offset(node, lmc, 2 /* 2=read */, dllro, by);
+ change_dll_offset_enable(node, lmc, 1);
+ }
+ }
+
+ } /* for (lmc = 0; lmc < num_lmcs; lmc++) */
+
+ // finish up...
+
+ return 0;
+
+} /* perform_HW_dll_offset_tuning */