From 95c48cbbb5b679ddbc2bd115becc04454e4adffd Mon Sep 17 00:00:00 2001 From: Arthur Heymans Date: Sat, 4 Nov 2017 08:07:06 +0100 Subject: nb/intel/x4x: Implement both read and write training This training finds the optimal write DQ delay and read DQS delay settings. It does so on all lanes at the same time, like vendor (training each lane individually has poor results). The results are stored in the sysinfo struct and restored on next boots and S3 resume. This potentially increases stability as optimal settings are chosen and is more necessary for DDR3 raminit where the write DQS delays are leveled/variable due to the flyby topology. TESTED on Intel DG43GT with (2G + 1G) on each channel, see that the results are quite close to the safe original ones (that previously worked fine) and tested with memtest86+. Change-Id: Iacdc63b91b4705d1a80437314bfe55385ea5b6c1 Signed-off-by: Arthur Heymans Reviewed-on: https://review.coreboot.org/22329 Tested-by: build bot (Jenkins) Reviewed-by: Felix Held --- src/northbridge/intel/x4x/Makefile.inc | 1 + src/northbridge/intel/x4x/dq_dqs.c | 503 +++++++++++++++++++++++++++++++ src/northbridge/intel/x4x/raminit_ddr2.c | 19 +- src/northbridge/intel/x4x/x4x.h | 6 + 4 files changed, 522 insertions(+), 7 deletions(-) create mode 100644 src/northbridge/intel/x4x/dq_dqs.c (limited to 'src/northbridge/intel') diff --git a/src/northbridge/intel/x4x/Makefile.inc b/src/northbridge/intel/x4x/Makefile.inc index fb9dc1591b..29ece07526 100644 --- a/src/northbridge/intel/x4x/Makefile.inc +++ b/src/northbridge/intel/x4x/Makefile.inc @@ -22,6 +22,7 @@ romstage-y += raminit_ddr2.c romstage-y += ram_calc.c romstage-y += rcven.c romstage-y += raminit_tables.c +romstage-y += dq_dqs.c ramstage-y += acpi.c ramstage-y += ram_calc.c diff --git a/src/northbridge/intel/x4x/dq_dqs.c b/src/northbridge/intel/x4x/dq_dqs.c new file mode 100644 index 0000000000..5de8837a1e --- /dev/null +++ b/src/northbridge/intel/x4x/dq_dqs.c @@ -0,0 +1,503 @@ +/* + * This file is part 
of the coreboot project. + * + * Copyright (C) 2017-2018 Arthur Heymans + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include "x4x.h" +#include "iomap.h" + +static void print_dll_setting(const struct dll_setting *dll_setting, + u8 default_verbose) +{ + u8 debug_level = default_verbose ? BIOS_DEBUG : RAM_DEBUG; + + printk(debug_level, "%d.%d.%d.%d:%d.%d\n", dll_setting->coarse, + dll_setting->clk_delay, dll_setting->tap, + dll_setting->pi, dll_setting->db_en, + dll_setting->db_sel); +} + +struct db_limit { + u8 tap0; + u8 tap1; + u8 pi0; + u8 pi1; +}; + +static void set_db(const struct sysinfo *s, struct dll_setting *dq_dqs_setting) +{ + struct db_limit limit; + + switch (s->selected_timings.mem_clk) { + default: + case MEM_CLOCK_800MHz: + limit.tap0 = 3; + limit.tap1 = 10; + limit.pi0 = 2; + limit.pi1 = 3; + break; + case MEM_CLOCK_1066MHz: + limit.tap0 = 2; + limit.tap1 = 8; + limit.pi0 = 6; + limit.pi1 = 7; + break; + case MEM_CLOCK_1333MHz: + limit.tap0 = 3; + limit.tap1 = 11; + /* TO CHECK: Might be reverse since this makes little sense */ + limit.pi0 = 6; + limit.pi1 = 4; + break; + } + + if (dq_dqs_setting->tap < limit.tap0) { + dq_dqs_setting->db_en = 1; + dq_dqs_setting->db_sel = 1; + } else if ((dq_dqs_setting->tap == limit.tap0) + && (dq_dqs_setting->pi < limit.pi0)) { + dq_dqs_setting->db_en = 1; + dq_dqs_setting->db_sel = 1; + } else if (dq_dqs_setting->tap < limit.tap1) { + dq_dqs_setting->db_en = 0; + dq_dqs_setting->db_sel = 0; + } 
else if ((dq_dqs_setting->tap == limit.tap1) + && (dq_dqs_setting->pi < limit.pi1)) { + dq_dqs_setting->db_en = 0; + dq_dqs_setting->db_sel = 0; + } else { + dq_dqs_setting->db_en = 1; + dq_dqs_setting->db_sel = 0; + } +} + +const static u8 max_tap[3] = {12, 10, 13}; + +static int increment_dq_dqs(const struct sysinfo *s, + struct dll_setting *dq_dqs_setting) +{ + u8 max_tap_val = max_tap[s->selected_timings.mem_clk + - MEM_CLOCK_800MHz]; + + if (dq_dqs_setting->pi < 6) { + dq_dqs_setting->pi += 1; + } else if (dq_dqs_setting->tap < max_tap_val) { + dq_dqs_setting->pi = 0; + dq_dqs_setting->tap += 1; + } else if (dq_dqs_setting->clk_delay < 2) { + dq_dqs_setting->pi = 0; + dq_dqs_setting->tap = 0; + dq_dqs_setting->clk_delay += 1; + } else if (dq_dqs_setting->coarse < 1) { + dq_dqs_setting->pi = 0; + dq_dqs_setting->tap = 0; + dq_dqs_setting->clk_delay -= 1; + dq_dqs_setting->coarse += 1; + } else { + return CB_ERR; + } + set_db(s, dq_dqs_setting); + return CB_SUCCESS; +} + +#define WT_PATTERN_SIZE 80 + +static const u32 write_training_schedule[WT_PATTERN_SIZE] = { + 0xffffffff, 0x00000000, 0xffffffff, 0x00000000, + 0xffffffff, 0x00000000, 0xffffffff, 0x00000000, + 0xffffffff, 0x00000000, 0xffffffff, 0x00000000, + 0xffffffff, 0x00000000, 0xffffffff, 0x00000000, + 0xefefefef, 0x10101010, 0xefefefef, 0x10101010, + 0xefefefef, 0x10101010, 0xefefefef, 0x10101010, + 0xefefefef, 0x10101010, 0xefefefef, 0x10101010, + 0xefefefef, 0x10101010, 0xefefefef, 0x10101010, + 0xefefefef, 0xeeeeeeee, 0x11111111, 0x10101010, + 0xefefefef, 0xeeeeeeee, 0x11111111, 0x10101010, + 0xefefefef, 0xeeeeeeee, 0x11111111, 0x10101010, + 0xefefefef, 0xeeeeeeee, 0x11111111, 0x10101010, + 0x03030303, 0x04040404, 0x09090909, 0x10101010, + 0x21212121, 0x40404040, 0x81818181, 0x00000000, + 0x03030303, 0x04040404, 0x09090909, 0x10101010, + 0x21212121, 0x40404040, 0x81818181, 0x00000000, + 0xfdfdfdfd, 0xfafafafa, 0xf7f7f7f7, 0xeeeeeeee, + 0xdfdfdfdf, 0xbebebebe, 0x7f7f7f7f, 0xfefefefe, + 0xfdfdfdfd, 
0xfafafafa, 0xf7f7f7f7, 0xeeeeeeee, + 0xdfdfdfdf, 0xbebebebe, 0x7f7f7f7f, 0xfefefefe, +}; + +enum training_modes { + SUCCEEDING = 0, + FAILING = 1 +}; + +static u8 test_dq_aligned(const struct sysinfo *s, + const u8 channel) +{ + u32 address; + int rank, lane; + u8 count, count1; + u8 data[8]; + u8 lane_error = 0; + + FOR_EACH_POPULATED_RANK_IN_CHANNEL(s->dimms, channel, rank) { + address = test_address(channel, rank); + for (count = 0; count < WT_PATTERN_SIZE; count++) { + for (count1 = 0; count1 < WT_PATTERN_SIZE; count1++) { + if ((count1 % 16) == 0) + MCHBAR32(0xf90) = 1; + const u32 pattern = + write_training_schedule[count1]; + write32((u32 *)address + 8 * count1, pattern); + write32((u32 *)address + 8 * count1 + 4, + pattern); + } + + const u32 good = write_training_schedule[count]; + write32(&data[0], read32((u32 *)address + 8 * count)); + write32(&data[4], + read32((u32 *)address + 8 * count + 4)); + FOR_EACH_BYTELANE(lane) { + u8 expected = (good >> ((lane % 4) * 8)) & 0xff; + if (data[lane] != expected) + lane_error |= 1 << lane; + } + } + } + return lane_error; +} + +#define CONSISTENCY 10 + +/* + * This function finds either failing or succeeding writes by increasing DQ. + * When it has found a failing or succeeding setting it will increase DQ + * another 10 times to make sure the result is consistent. + * This is probably done because lanes cannot be trained independently of + * each other. + */ +static int find_dq_limit(const struct sysinfo *s, const u8 channel, + struct dll_setting dq_setting[TOTAL_BYTELANES], + u8 dq_lim[TOTAL_BYTELANES], + const enum training_modes expected_result) +{ + int status = CB_SUCCESS; + int lane; + u8 test_result; + u8 pass_count[TOTAL_BYTELANES]; + u8 succes_mask = 0xff; + + printk(RAM_DEBUG, "Looking for %s writes on channel %d\n", + expected_result == FAILING ? 
"failing" : "succeeding", channel); + memset(pass_count, 0, sizeof(pass_count)); + + while(succes_mask) { + test_result = test_dq_aligned(s, channel); + FOR_EACH_BYTELANE(lane) { + if (((test_result >> lane) & 1) != expected_result) { + status = increment_dq_dqs(s, &dq_setting[lane]); + dqset(channel, lane, &dq_setting[lane]); + dq_lim[lane]++; + } else if (pass_count[lane] < CONSISTENCY) { + status = increment_dq_dqs(s, &dq_setting[lane]); + dqset(channel, lane, &dq_setting[lane]); + dq_lim[lane]++; + pass_count[lane]++; + } else if (pass_count[lane] == CONSISTENCY) { + succes_mask &= ~(1 << lane); + } + if (status == CB_ERR) { + printk(BIOS_CRIT, "Could not find a case of %s " + "writes on CH%d, lane %d\n", + expected_result == FAILING ? "failing" + : "succeeding", channel, lane); + return CB_ERR; + } + } + } + return CB_SUCCESS; +} + +/* + * This attempts to find the ideal delay for DQ to account for the skew between + * the DQ and the DQS signal. + * The training works this way: + * - start from the DQS delay values (DQ is always later than DQS) + * - increment the DQ delay until a succeeding write is found on all bytelanes, + * on all ranks on a channel and save these values + * - again increment the DQ delay until writes start to fail on all bytelanes + * and save that value + * - use the mean between the saved succeeding and failing value + * - note: bytelanes cannot be trained independently, so the delays need to be + * adjusted and tested for all of them at the same time + */ +int do_write_training(struct sysinfo *s) +{ + int i; + u8 channel, lane; + u8 dq_lower[TOTAL_BYTELANES]; + u8 dq_upper[TOTAL_BYTELANES]; + struct dll_setting dq_setting[TOTAL_BYTELANES]; + u8 dq_average; + u32 dq_absolute; + + printk(BIOS_DEBUG, "Starting DQ write training\n"); + + FOR_EACH_POPULATED_CHANNEL(s->dimms, channel) { + printk(BIOS_DEBUG, "Doing DQ write training on CH%d\n", channel); + + dq_average = 0; + dq_absolute = 0; + /* Start all lanes at DQS values */ + 
FOR_EACH_BYTELANE(lane) { + dqset(channel, lane, &s->dqs_settings[channel][lane]); + s->dq_settings[channel][lane] = s->dqs_settings[channel][lane]; + } + memset(dq_lower, 0, sizeof(dq_lower)); + /* Start from DQS settings */ + memcpy(dq_setting, s->dqs_settings[channel], sizeof(dq_setting)); + + if (find_dq_limit(s, channel, dq_setting, dq_lower, + SUCCEEDING)) { + printk(BIOS_CRIT, + "Could not find working lower limit DQ setting\n"); + return CB_ERR; + } + + memcpy(dq_upper, dq_lower, sizeof(dq_lower)); + + if (find_dq_limit(s, channel, dq_setting, dq_upper, + FAILING)) { + printk(BIOS_WARNING, + "Could not find failing upper limit DQ setting\n"); + return CB_ERR; + } + + FOR_EACH_BYTELANE(lane) { + dq_lower[lane] -= CONSISTENCY - 1; + dq_upper[lane] -= CONSISTENCY - 1; + u8 dq_center = (dq_upper[lane] + dq_lower[lane]) / 2; + + printk(RAM_DEBUG, "Centered value for DQ DLL:" + " ch%d, lane %d, #steps = %d\n", + channel, lane, dq_center); + for (i = 0; i < dq_center; i++) { + /* Should never happen */ + if (increment_dq_dqs(s, &s->dq_settings[channel][lane]) + == CB_ERR) + printk(BIOS_ERR, + "Huh? 
write training overflowed!!\n"); + } + } + + /* Reset DQ DLL settings and increment with centered value*/ + printk(BIOS_DEBUG, "Final DQ timings on CH%d\n", channel); + FOR_EACH_BYTELANE(lane) { + printk(BIOS_DEBUG, "\tlane%d: ", lane); + print_dll_setting(&s->dq_settings[channel][lane], 1); + dqset(channel, lane, &s->dq_settings[channel][lane]); + } + } + printk(BIOS_DEBUG, "Done DQ write training\n"); + return CB_SUCCESS; +} + +#define RT_PATTERN_SIZE 40 + +static const u32 read_training_schedule[RT_PATTERN_SIZE] = { + 0xffffffff, 0x00000000, 0xffffffff, 0x00000000, + 0xffffffff, 0x00000000, 0xffffffff, 0x00000000, + 0xefefefef, 0x10101010, 0xefefefef, 0x10101010, + 0xefefefef, 0x10101010, 0xefefefef, 0x10101010, + 0xefefefef, 0xeeeeeeee, 0x11111111, 0x10101010, + 0xefefefef, 0xeeeeeeee, 0x11111111, 0x10101010, + 0x03030303, 0x04040404, 0x09090909, 0x10101010, + 0x21212121, 0x40404040, 0x81818181, 0x00000000, + 0xfdfdfdfd, 0xfafafafa, 0xf7f7f7f7, 0xeeeeeeee, + 0xdfdfdfdf, 0xbebebebe, 0x7f7f7f7f, 0xfefefefe +}; + +static int rt_increment_dqs(struct rt_dqs_setting *setting) +{ + if (setting->pi < 7) { + setting->pi++; + } else if (setting->tap < 14) { + setting->pi = 0; + setting->tap++; + } else { + return CB_ERR; + } + return CB_SUCCESS; +} + +static u8 test_dqs_aligned(const struct sysinfo *s, const u8 channel) +{ + int i, rank, lane; + volatile u8 data[8]; + u32 address; + u8 bytelane_error = 0; + + FOR_EACH_POPULATED_RANK_IN_CHANNEL(s->dimms, channel, rank) { + address = test_address(channel, rank); + for (i = 0; i < RT_PATTERN_SIZE; i++) { + const u32 good = read_training_schedule[i]; + write32(&data[0], read32((u32 *)address + i * 8)); + write32(&data[4], read32((u32 *)address + i * 8 + 4)); + + FOR_EACH_BYTELANE(lane) { + if (data[lane] != (good & 0xff)) + bytelane_error |= 1 << lane; + } + } + } + return bytelane_error; +} + +static int rt_find_dqs_limit(struct sysinfo *s, u8 channel, + struct rt_dqs_setting dqs_setting[TOTAL_BYTELANES], + u8 
dqs_lim[TOTAL_BYTELANES], + const enum training_modes expected_result) +{ + int lane; + u8 test_result; + int status = CB_SUCCESS; + + FOR_EACH_BYTELANE(lane) + rt_set_dqs(channel, lane, 0, &dqs_setting[lane]); + + while(status == CB_SUCCESS) { + test_result = test_dqs_aligned(s, channel); + if (test_result == (expected_result == SUCCEEDING ? 0 : 0xff)) + return CB_SUCCESS; + FOR_EACH_BYTELANE(lane) { + if (((test_result >> lane) & 1) != expected_result) { + status = rt_increment_dqs(&dqs_setting[lane]); + dqs_lim[lane]++; + rt_set_dqs(channel, lane, 0, &dqs_setting[lane]); + } + } + } + + if (expected_result == SUCCEEDING) { + printk(BIOS_CRIT, + "Could not find RT DQS setting\n"); + return CB_ERR; + } else { + printk(RAM_DEBUG, + "Read succeeded over all DQS" + " settings, continuing\n"); + return CB_SUCCESS; + } +} + +#define RT_LOOPS 3 + +/* + * This attempts to find the ideal delay for DQS on reads (rx). + * The training works this way: + * - start from the lowest possible delay (0) on all bytelanes + * - increment the DQS rx delays until a succeeding read is found on all + * bytelanes, on all ranks on a channel and save these values + * - again increment the DQS rx delay until reads start to fail on all bytelanes + * and save that value + * - use the mean between the saved succeeding and failing value + * - note0: bytelanes cannot be trained independently, so the delays need to be + * adjusted and tested for all of them at the same time + * - note1: this memory controller appears to have per rank registers for these + * DQS rx delays, but only the one for rank 0 seems to be used for all of them + */ +int do_read_training(struct sysinfo *s) +{ + int loop, channel, i, lane, rank; + u32 address, content; + u8 dqs_lower[TOTAL_BYTELANES]; + u8 dqs_upper[TOTAL_BYTELANES]; + struct rt_dqs_setting dqs_setting[TOTAL_BYTELANES]; + u16 saved_dqs_center[TOTAL_CHANNELS][TOTAL_BYTELANES]; + + memset(saved_dqs_center, 0 , sizeof(saved_dqs_center)); + + printk(BIOS_DEBUG, 
"Starting DQS read training\n"); + + for (loop = 0; loop < RT_LOOPS; loop++) { + FOR_EACH_POPULATED_CHANNEL(s->dimms, channel) { + printk(RAM_DEBUG, "Doing DQS read training on CH%d\n", + channel); + + /* Write pattern to strobe address */ + FOR_EACH_POPULATED_RANK_IN_CHANNEL(s->dimms, channel, rank) { + address = test_address(channel, rank); + for (i = 0; i < RT_PATTERN_SIZE; i++) { + content = read_training_schedule[i]; + write32((u32 *)address + 8 * i, content); + write32((u32 *)address + 8 * i + 4, content); + } + } + + memset(dqs_lower, 0, sizeof(dqs_lower)); + memset(&dqs_setting, 0, sizeof(dqs_setting)); + if (rt_find_dqs_limit(s, channel, dqs_setting, dqs_lower, + SUCCEEDING)) { + printk(BIOS_CRIT, + "Could not find working lower limit DQS setting\n"); + return CB_ERR; + } + + FOR_EACH_BYTELANE(lane) + dqs_upper[lane] = dqs_lower[lane]; + + if (rt_find_dqs_limit(s, channel, dqs_setting, dqs_upper, + FAILING)) { + printk(BIOS_CRIT, + "Could not find failing upper limit DQ setting\n"); + return CB_ERR; + } + + printk(RAM_DEBUG, "Centered values, loop %d:\n", loop); + FOR_EACH_BYTELANE(lane) { + u8 center = (dqs_lower[lane] + dqs_upper[lane]) / 2; + printk(RAM_DEBUG, "\t lane%d: #%d\n", lane, center); + saved_dqs_center[channel][lane] += center; + } + } /* END FOR_EACH_POPULATED_CHANNEL */ + } /* end RT_LOOPS */ + + memset(s->rt_dqs, 0, sizeof(s->rt_dqs)); + + FOR_EACH_POPULATED_CHANNEL(s->dimms, channel) { + printk(BIOS_DEBUG, "Final timings on CH%d:\n", channel); + FOR_EACH_BYTELANE(lane) { + saved_dqs_center[channel][lane] /= RT_LOOPS; + while (saved_dqs_center[channel][lane]--) { + if(rt_increment_dqs(&s->rt_dqs[channel][lane]) + == CB_ERR) + /* Should never happen */ + printk(BIOS_ERR, + "Huh? 
read training overflowed!!\n"); + } + FOR_EACH_POPULATED_RANK_IN_CHANNEL(s->dimms, channel, rank) + rt_set_dqs(channel, lane, rank, + &s->rt_dqs[channel][lane]); + printk(BIOS_DEBUG, "\tlane%d: %d.%d\n", + lane, s->rt_dqs[channel][lane].tap, + s->rt_dqs[channel][lane].pi); + } + } + printk(BIOS_DEBUG, "Done DQS read training\n"); + return CB_SUCCESS; +} diff --git a/src/northbridge/intel/x4x/raminit_ddr2.c b/src/northbridge/intel/x4x/raminit_ddr2.c index b9675836e2..a36242b2d8 100644 --- a/src/northbridge/intel/x4x/raminit_ddr2.c +++ b/src/northbridge/intel/x4x/raminit_ddr2.c @@ -293,7 +293,7 @@ static void cmdset(u8 ch, const struct dll_setting *setting) * All finer DQ and DQS DLL settings are set to the same value * for each rank in a channel, while coarse is common. */ -static void dqsset(u8 ch, u8 lane, const struct dll_setting *setting) +void dqsset(u8 ch, u8 lane, const struct dll_setting *setting) { int rank; @@ -320,7 +320,7 @@ static void dqsset(u8 ch, u8 lane, const struct dll_setting *setting) } } -static void dqset(u8 ch, u8 lane, const struct dll_setting *setting) +void dqset(u8 ch, u8 lane, const struct dll_setting *setting) { int rank; MCHBAR32(0x400 * ch + 0x5fc) = (MCHBAR32(0x400 * ch + 0x5fc) @@ -346,12 +346,12 @@ static void dqset(u8 ch, u8 lane, const struct dll_setting *setting) } } -static void rt_set_dqs(u8 channel, u8 lane, u8 rank, +void rt_set_dqs(u8 channel, u8 lane, u8 rank, struct rt_dqs_setting *dqs_setting) { u16 saved_tap = MCHBAR16(0x540 + 0x400 * channel + lane * 4); u16 saved_pi = MCHBAR16(0x542 + 0x400 * channel + lane * 4); - printk(RAM_SPEW, "RT DQS: ch%d, L%d, %d.%d\n", channel, lane, + printk(RAM_SPEW, "RT DQS: ch%d, r%d, L%d: %d.%d\n", channel, rank, lane, dqs_setting->tap, dqs_setting->pi); @@ -1680,9 +1680,14 @@ void raminit_ddr2(struct sysinfo *s, int fast_boot) // XXX tRD - // XXX Write training - - // XXX Read training + if (!fast_boot) { + if (s->selected_timings.mem_clk > MEM_CLOCK_667MHz) { + if(do_write_training(s)) 
+ die("DQ write training failed!"); + } + if (do_read_training(s)) + die("DQS read training failed!"); + } // DRADRB dradrb_ddr2(s); diff --git a/src/northbridge/intel/x4x/x4x.h b/src/northbridge/intel/x4x/x4x.h index 4ee0c56abc..5017aa030a 100644 --- a/src/northbridge/intel/x4x/x4x.h +++ b/src/northbridge/intel/x4x/x4x.h @@ -366,6 +366,12 @@ void rcven(struct sysinfo *s); u32 fsb2mhz(u32 speed); u32 ddr2mhz(u32 speed); u32 test_address(int channel, int rank); +void dqsset(u8 ch, u8 lane, const struct dll_setting *setting); +void dqset(u8 ch, u8 lane, const struct dll_setting *setting); +void rt_set_dqs(u8 channel, u8 lane, u8 rank, + struct rt_dqs_setting *dqs_setting); +int do_write_training(struct sysinfo *s); +int do_read_training(struct sysinfo *s); extern const struct dll_setting default_ddr2_667_ctrl[7]; extern const struct dll_setting default_ddr2_800_ctrl[7]; -- cgit v1.2.3