/*
 * This file is part of the coreboot project.
 *
 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

/******************************************************************************
 Description: Receiver En and DQS Timing Training feature for DDR 2 MCT
******************************************************************************/

static void dqsTrainRcvrEn_SW(struct MCTStatStruc *pMCTstat,
				struct DCTStatStruc *pDCTstat, u8 Pass);
static u8 mct_SavePassRcvEnDly_D(struct DCTStatStruc *pDCTstat,
					u8 rcvrEnDly, u8 Channel,
					u8 receiver, u8 Pass);
static u8 mct_CompareTestPatternQW0_D(struct MCTStatStruc *pMCTstat,
					struct DCTStatStruc *pDCTstat,
					u32 addr, u8 channel,
					u8 pattern, u8 Pass);
static void mct_InitDQSPos4RcvrEn_D(struct MCTStatStruc *pMCTstat,
					 struct DCTStatStruc *pDCTstat);
static void InitDQSPos4RcvrEn_D(struct MCTStatStruc *pMCTstat,
				struct DCTStatStruc *pDCTstat, u8 Channel);
static void CalcEccDQSRcvrEn_D(struct MCTStatStruc *pMCTstat,
				struct DCTStatStruc *pDCTstat, u8 Channel);
static void mct_SetFinalRcvrEnDly_D(struct DCTStatStruc *pDCTstat,
				u8 RcvrEnDly, u8 where,
				u8 Channel, u8 Receiver,
				u32 dev, u32 index_reg,
				u8 Addl_Index, u8 Pass);
static void mct_SetMaxLatency_D(struct DCTStatStruc *pDCTstat, u8 Channel, u8 DQSRcvEnDly);
static void fenceDynTraining_D(struct MCTStatStruc *pMCTstat,
			struct DCTStatStruc *pDCTstat, u8 dct);
static void mct_DisableDQSRcvEn_D(struct DCTStatStruc *pDCTstat);


/* Warning:  These must be located so they do not cross a logical 16-bit
   segment boundary! */
static const u32 TestPattern0_D[] = {
	0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
	0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
	0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
	0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
};
static const u32 TestPattern1_D[] = {
	0x55555555, 0x55555555, 0x55555555, 0x55555555,
	0x55555555, 0x55555555, 0x55555555, 0x55555555,
	0x55555555, 0x55555555, 0x55555555, 0x55555555,
	0x55555555, 0x55555555, 0x55555555, 0x55555555,
};
static const u32 TestPattern2_D[] = {
	0x12345678, 0x87654321, 0x23456789, 0x98765432,
	0x59385824, 0x30496724, 0x24490795, 0x99938733,
	0x40385642, 0x38465245, 0x29432163, 0x05067894,
	0x12349045, 0x98723467, 0x12387634, 0x34587623,
};

static void SetupRcvrPattern(struct MCTStatStruc *pMCTstat,
		struct DCTStatStruc *pDCTstat, u32 *buffer, u8 pass)
{
	/*
	 * 1. Copy the alpha and Beta patterns from ROM to Cache,
	 *     aligning on 16 byte boundary
	 * 2. Set the ptr to DCTStatstruc.PtrPatternBufA for Alpha
	 * 3. Set the ptr to DCTStatstruc.PtrPatternBufB for Beta
	 */

	u32 *buf_a;
	u32 *buf_b;
	u32 *p_A;
	u32 *p_B;
	u8 i;

	buf_a = (u32 *)(((u32)buffer + 0x10) & (0xfffffff0));
	buf_b = buf_a + 32; //??
	p_A = (u32 *)SetupDqsPattern_1PassB(pass);
	p_B = (u32 *)SetupDqsPattern_1PassA(pass);

	for(i=0;i<16;i++) {
		buf_a[i] = p_A[i];
		buf_b[i] = p_B[i];
	}

	pDCTstat->PtrPatternBufA = (u32)buf_a;
	pDCTstat->PtrPatternBufB = (u32)buf_b;
}


void mct_TrainRcvrEn_D(struct MCTStatStruc *pMCTstat,
			struct DCTStatStruc *pDCTstat, u8 Pass)
{
	if(mct_checkNumberOfDqsRcvEn_1Pass(Pass))
		dqsTrainRcvrEn_SW(pMCTstat, pDCTstat, Pass);
}


static void dqsTrainRcvrEn_SW(struct MCTStatStruc *pMCTstat,
				struct DCTStatStruc *pDCTstat, u8 Pass)
{
	u8 Channel, RcvrEnDly, RcvrEnDlyRmin;
	u8 Test0, Test1, CurrTest, CurrTestSide0, CurrTestSide1;
	u8 CTLRMaxDelay, _2Ranks, PatternA, PatternB;
	u8 Addl_Index = 0;
	u8 Receiver;
	u8 _DisableDramECC = 0, _Wrap32Dis = 0, _SSE2 = 0;
	u8 RcvrEnDlyLimit, Final_Value, MaxDelay_CH[2];
	u32 TestAddr0, TestAddr1, TestAddr0B, TestAddr1B;
	u32 PatternBuffer[64+4]; /* FIXME: need increase 8? */
	u32 Errors;

	u32 val;
	u32 reg;
	u32 dev;
	u32 index_reg;
	u32 ch_start, ch_end, ch;
	u32 msr;
	u32 cr4;
	u32 lo, hi;

	u8 valid;
	u32 tmp;
	u8 LastTest;

	print_debug_dqs("\nTrainRcvEn: Node", pDCTstat->Node_ID, 0);
	print_debug_dqs("TrainRcvEn: Pass", Pass, 0);


	dev = pDCTstat->dev_dct;
	ch_start = 0;
	if(!pDCTstat->GangedMode) {
		ch_end = 2;
	} else {
		ch_end = 1;
	}

	for (ch = ch_start; ch < ch_end; ch++) {
		reg = 0x78 + (0x100 * ch);
		val = Get_NB32(dev, reg);
		val &= ~(0x3ff << 22);
		val |= (0x0c8 << 22);		/* Max Rd Lat */
		Set_NB32(dev, reg, val);
	}

	Final_Value = 1;
	if (Pass == FirstPass) {
		mct_InitDQSPos4RcvrEn_D(pMCTstat, pDCTstat);
	} else {
		pDCTstat->DimmTrainFail = 0;
		pDCTstat->CSTrainFail = ~pDCTstat->CSPresent;
	}
	print_t("TrainRcvrEn: 1\n");

	cr4 = read_cr4();
	if(cr4 & ( 1 << 9)) {	/* save the old value */
		_SSE2 = 1;
	}
	cr4 |= (1 << 9);	/* OSFXSR enable SSE2 */
	write_cr4(cr4);
	print_t("TrainRcvrEn: 2\n");

	msr = HWCR;
	_RDMSR(msr, &lo, &hi);
	//FIXME: Why use SSEDIS
	if(lo & (1 << 17)) {	/* save the old value */
		_Wrap32Dis = 1;
	}
	lo |= (1 << 17);	/* HWCR.wrap32dis */
	lo &= ~(1 << 15);	/* SSEDIS */
	_WRMSR(msr, lo, hi);	/* Setting wrap32dis allows 64-bit memory references in real mode */
	print_t("TrainRcvrEn: 3\n");

	_DisableDramECC = mct_DisableDimmEccEn_D(pMCTstat, pDCTstat);


	if(pDCTstat->Speed == 1) {
		pDCTstat->T1000 = 5000;	/* get the T1000 figure (cycle time (ns)*1K */
	} else if(pDCTstat->Speed == 2) {
		pDCTstat->T1000 = 3759;
	} else if(pDCTstat->Speed == 3) {
		pDCTstat->T1000 = 3003;
	} else if(pDCTstat->Speed == 4) {
		pDCTstat->T1000 = 2500;
	} else if(pDCTstat->Speed  == 5) {
		pDCTstat->T1000 = 1876;
	} else {
		pDCTstat->T1000 = 0;
	}

	SetupRcvrPattern(pMCTstat, pDCTstat, PatternBuffer, Pass);
	print_t("TrainRcvrEn: 4\n");

	Errors = 0;
	dev = pDCTstat->dev_dct;
	CTLRMaxDelay = 0;

	for (Channel = 0; Channel < 2; Channel++) {
		print_debug_dqs("\tTrainRcvEn51: Node ", pDCTstat->Node_ID, 1);
		print_debug_dqs("\tTrainRcvEn51: Channel ", Channel, 1);
		pDCTstat->Channel = Channel;

		MaxDelay_CH[Channel] = 0;
		index_reg = 0x98 + 0x100 * Channel;

		Receiver = mct_InitReceiver_D(pDCTstat, Channel);
		/* There are four receiver pairs, loosely associated with chipselects. */
		for (; Receiver < 8; Receiver += 2) {
			Addl_Index = (Receiver >> 1) * 3 + 0x10;
			LastTest = DQS_FAIL;

			/* mct_ModifyIndex_D */
			RcvrEnDlyRmin = RcvrEnDlyLimit = 0xff;

			print_debug_dqs("\t\tTrainRcvEnd52: index ", Addl_Index, 2);

			if(!mct_RcvrRankEnabled_D(pMCTstat, pDCTstat, Channel, Receiver)) {
				print_t("\t\t\tRank not enabled_D\n");
				continue;
			}

			TestAddr0 = mct_GetRcvrSysAddr_D(pMCTstat, pDCTstat, Channel, Receiver, &valid);
			if(!valid) {	/* Address not supported on current CS */
				print_t("\t\t\tAddress not supported on current CS\n");
				continue;
			}

			TestAddr0B = TestAddr0 + (BigPagex8_RJ8 << 3);

			if(mct_RcvrRankEnabled_D(pMCTstat, pDCTstat, Channel, Receiver+1)) {
				TestAddr1 = mct_GetRcvrSysAddr_D(pMCTstat, pDCTstat, Channel, Receiver+1, &valid);
				if(!valid) {	/* Address not supported on current CS */
					print_t("\t\t\tAddress not supported on current CS+1\n");
					continue;
				}
				TestAddr1B = TestAddr1 + (BigPagex8_RJ8 << 3);
				_2Ranks = 1;
			} else {
				_2Ranks = TestAddr1 = TestAddr1B = 0;
			}

			print_debug_dqs("\t\tTrainRcvEn53: TestAddr0 ", TestAddr0, 2);
			print_debug_dqs("\t\tTrainRcvEn53: TestAddr0B ", TestAddr0B, 2);
			print_debug_dqs("\t\tTrainRcvEn53: TestAddr1 ", TestAddr1, 2);
			print_debug_dqs("\t\tTrainRcvEn53: TestAddr1B ", TestAddr1B, 2);

			/*
			 * Get starting RcvrEnDly value
			 */
			RcvrEnDly = mct_Get_Start_RcvrEnDly_1Pass(Pass);

			/* mct_GetInitFlag_D*/
			if (Pass == FirstPass) {
				pDCTstat->DqsRcvEn_Pass = 0;
			} else {
				pDCTstat->DqsRcvEn_Pass=0xFF;
			}
			pDCTstat->DqsRcvEn_Saved = 0;


			while(RcvrEnDly < RcvrEnDlyLimit) {	/* sweep Delay value here */
				print_debug_dqs("\t\t\tTrainRcvEn541: RcvrEnDly ", RcvrEnDly, 3);

				/* callback not required
				if(mct_AdjustDelay_D(pDCTstat, RcvrEnDly))
					goto skipDly;
				*/

				/* Odd steps get another pattern such that even
				 and odd steps alternate. The pointers to the
				 patterns will be swaped at the end of the loop
				 so that they correspond. */
				if(RcvrEnDly & 1) {
					PatternA = 1;
					PatternB = 0;
				} else {
					/* Even step */
					PatternA = 0;
					PatternB = 1;
				}

				mct_Write1LTestPattern_D(pMCTstat, pDCTstat, TestAddr0, PatternA); /* rank 0 of DIMM, testpattern 0 */
				mct_Write1LTestPattern_D(pMCTstat, pDCTstat, TestAddr0B, PatternB); /* rank 0 of DIMM, testpattern 1 */
				if(_2Ranks) {
					mct_Write1LTestPattern_D(pMCTstat, pDCTstat, TestAddr1, PatternA); /*rank 1 of DIMM, testpattern 0 */
					mct_Write1LTestPattern_D(pMCTstat, pDCTstat, TestAddr1B, PatternB); /*rank 1 of DIMM, testpattern 1 */
				}

				mct_SetRcvrEnDly_D(pDCTstat, RcvrEnDly, 0, Channel, Receiver, dev, index_reg, Addl_Index, Pass);

				CurrTest = DQS_FAIL;
				CurrTestSide0 = DQS_FAIL;
				CurrTestSide1 = DQS_FAIL;

				mct_Read1LTestPattern_D(pMCTstat, pDCTstat, TestAddr0);	/*cache fills */
				Test0 = mct_CompareTestPatternQW0_D(pMCTstat, pDCTstat, TestAddr0, Channel, PatternA, Pass);/* ROM vs cache compare */
				proc_IOCLFLUSH_D(TestAddr0);
				ResetDCTWrPtr_D(dev, index_reg, Addl_Index);

				print_debug_dqs("\t\t\tTrainRcvEn542: Test0 result ", Test0, 3);

				// != 0x00 mean pass

				if(Test0 == DQS_PASS) {
					mct_Read1LTestPattern_D(pMCTstat, pDCTstat, TestAddr0B);	/*cache fills */
					/* ROM vs cache compare */
					Test1 = mct_CompareTestPatternQW0_D(pMCTstat, pDCTstat, TestAddr0B, Channel, PatternB, Pass);
					proc_IOCLFLUSH_D(TestAddr0B);
					ResetDCTWrPtr_D(dev, index_reg, Addl_Index);

					print_debug_dqs("\t\t\tTrainRcvEn543: Test1 result ", Test1, 3);

					if(Test1 == DQS_PASS) {
						CurrTestSide0 = DQS_PASS;
					}
				}
				if(_2Ranks) {
					mct_Read1LTestPattern_D(pMCTstat, pDCTstat, TestAddr1);	/*cache fills */
					/* ROM vs cache compare */
					Test0 = mct_CompareTestPatternQW0_D(pMCTstat, pDCTstat, TestAddr1, Channel, PatternA, Pass);
					proc_IOCLFLUSH_D(TestAddr1);
					ResetDCTWrPtr_D(dev, index_reg, Addl_Index);

					print_debug_dqs("\t\t\tTrainRcvEn544: Test0 result ", Test0, 3);

					if(Test0 == DQS_PASS) {
						mct_Read1LTestPattern_D(pMCTstat, pDCTstat, TestAddr1B);	/*cache fills */
						/* ROM vs cache compare */
						Test1 = mct_CompareTestPatternQW0_D(pMCTstat, pDCTstat, TestAddr1B, Channel, PatternB, Pass);
						proc_IOCLFLUSH_D(TestAddr1B);
						ResetDCTWrPtr_D(dev, index_reg, Addl_Index);

						print_debug_dqs("\t\t\tTrainRcvEn545: Test1 result ", Test1, 3);
						if(Test1 == DQS_PASS) {
							CurrTestSide1 = DQS_PASS;
						}
					}
				}

				if(_2Ranks) {
					if ((CurrTestSide0 == DQS_PASS) && (CurrTestSide1 == DQS_PASS)) {
						CurrTest = DQS_PASS;
					}
				} else if (CurrTestSide0 == DQS_PASS) {
					CurrTest = DQS_PASS;
				}


				/* record first pass DqsRcvEn to stack */
				valid = mct_SavePassRcvEnDly_D(pDCTstat, RcvrEnDly, Channel, Receiver, Pass);

				/* Break(1:RevF,2:DR) or not(0) FIXME: This comment deosn't make sense */
				if(valid == 2 || (LastTest == DQS_FAIL && valid == 1)) {
					RcvrEnDlyRmin = RcvrEnDly;
					break;
				}

				LastTest = CurrTest;

				/* swap the rank 0 pointers */
				tmp = TestAddr0;
				TestAddr0 = TestAddr0B;
				TestAddr0B = tmp;

				/* swap the rank 1 pointers */
				tmp = TestAddr1;
				TestAddr1 = TestAddr1B;
				TestAddr1B = tmp;

				print_debug_dqs("\t\t\tTrainRcvEn56: RcvrEnDly ", RcvrEnDly, 3);

				RcvrEnDly++;

			}	/* while RcvrEnDly */

			print_debug_dqs("\t\tTrainRcvEn61: RcvrEnDly ", RcvrEnDly, 2);
			print_debug_dqs("\t\tTrainRcvEn61: RcvrEnDlyRmin ", RcvrEnDlyRmin, 3);
			print_debug_dqs("\t\tTrainRcvEn61: RcvrEnDlyLimit ", RcvrEnDlyLimit, 3);
			if(RcvrEnDlyRmin == RcvrEnDlyLimit) {
				/* no passing window */
				pDCTstat->ErrStatus |= 1 << SB_NORCVREN;
				Errors |= 1 << SB_NORCVREN;
				pDCTstat->ErrCode = SC_FatalErr;
			}

			if(RcvrEnDly > (RcvrEnDlyLimit - 1)) {
				/* passing window too narrow, too far delayed*/
				pDCTstat->ErrStatus |= 1 << SB_SmallRCVR;
				Errors |= 1 << SB_SmallRCVR;
				pDCTstat->ErrCode = SC_FatalErr;
				RcvrEnDly = RcvrEnDlyLimit - 1;
				pDCTstat->CSTrainFail |= 1 << Receiver;
				pDCTstat->DimmTrainFail |= 1 << (Receiver + Channel);
			}

			// CHB_D0_B0_RCVRDLY set in mct_Average_RcvrEnDly_Pass
			mct_Average_RcvrEnDly_Pass(pDCTstat, RcvrEnDly, RcvrEnDlyLimit, Channel, Receiver, Pass);

			mct_SetFinalRcvrEnDly_D(pDCTstat, RcvrEnDly, Final_Value, Channel, Receiver, dev, index_reg, Addl_Index, Pass);

			if(pDCTstat->ErrStatus & (1 << SB_SmallRCVR)) {
				Errors |= 1 << SB_SmallRCVR;
			}

			RcvrEnDly += Pass1MemClkDly;
			if(RcvrEnDly > CTLRMaxDelay) {
				CTLRMaxDelay = RcvrEnDly;
			}

		}	/* while Receiver */

		MaxDelay_CH[Channel] = CTLRMaxDelay;
	}	/* for Channel */

	CTLRMaxDelay = MaxDelay_CH[0];
	if (MaxDelay_CH[1] > CTLRMaxDelay)
		CTLRMaxDelay = MaxDelay_CH[1];

	for (Channel = 0; Channel < 2; Channel++) {
		mct_SetMaxLatency_D(pDCTstat, Channel, CTLRMaxDelay); /* program Ch A/B MaxAsyncLat to correspond with max delay */
	}

	ResetDCTWrPtr_D(dev, index_reg, Addl_Index);

	if(_DisableDramECC) {
		mct_EnableDimmEccEn_D(pMCTstat, pDCTstat, _DisableDramECC);
	}

	if (Pass == FirstPass) {
		/*Disable DQSRcvrEn training mode */
		print_t("TrainRcvrEn: mct_DisableDQSRcvEn_D\n");
		mct_DisableDQSRcvEn_D(pDCTstat);
	}

	if(!_Wrap32Dis) {
		msr = HWCR;
		_RDMSR(msr, &lo, &hi);
		lo &= ~(1<<17);		/* restore HWCR.wrap32dis */
		_WRMSR(msr, lo, hi);
	}
	if(!_SSE2){
		cr4 = read_cr4();
		cr4 &= ~(1<<9); 	/* restore cr4.OSFXSR */
		write_cr4(cr4);
	}

#if DQS_TRAIN_DEBUG > 0
	{
		u8 Channel;
		printk(BIOS_DEBUG, "TrainRcvrEn: CH_MaxRdLat:\n");
		for(Channel = 0; Channel<2; Channel++) {
			printk(BIOS_DEBUG, "Channel: %02x: %02x\n", Channel, pDCTstat->CH_MaxRdLat[Channel]);
		}
	}
#endif

#if DQS_TRAIN_DEBUG > 0
	{
		u8 val;
		u8 Channel, Receiver;
		u8 i;
		u8 *p;

		printk(BIOS_DEBUG, "TrainRcvrEn: CH_D_B_RCVRDLY:\n");
		for(Channel = 0; Channel < 2; Channel++) {
			printk(BIOS_DEBUG, "Channel: %02x\n", Channel);
			for(Receiver = 0; Receiver<8; Receiver+=2) {
				printk(BIOS_DEBUG, "\t\tReceiver: %02x: ", Receiver);
				p = pDCTstat->CH_D_B_RCVRDLY[Channel][Receiver>>1];
				for (i=0;i<8; i++) {
					val  = p[i];
					printk(BIOS_DEBUG, "%02x ", val);
				}
			printk(BIOS_DEBUG, "\n");
			}
		}
	}
#endif

	print_tx("TrainRcvrEn: Status ", pDCTstat->Status);
	print_tx("TrainRcvrEn: ErrStatus ", pDCTstat->ErrStatus);
	print_tx("TrainRcvrEn: ErrCode ", pDCTstat->ErrCode);
	print_t("TrainRcvrEn: Done\n");
}


u8 mct_InitReceiver_D(struct DCTStatStruc *pDCTstat, u8 dct)
{
	if (pDCTstat->DIMMValidDCT[dct] == 0 ) {
		return 8;
	} else {
		return 0;
	}
}


static void mct_SetFinalRcvrEnDly_D(struct DCTStatStruc *pDCTstat, u8 RcvrEnDly, u8 where, u8 Channel, u8 Receiver, u32 dev, u32 index_reg, u8 Addl_Index, u8 Pass/*, u8 *p*/)
{
	/*
	 * Program final DqsRcvEnDly to additional index for DQS receiver
	 *  enabled delay
	 */
	mct_SetRcvrEnDly_D(pDCTstat, RcvrEnDly, where, Channel, Receiver, dev, index_reg, Addl_Index, Pass);
}


static void mct_DisableDQSRcvEn_D(struct DCTStatStruc *pDCTstat)
{
	u8 ch_end, ch;
	u32 reg;
	u32 dev;
	u32 val;

	dev = pDCTstat->dev_dct;
	if (pDCTstat->GangedMode) {
		ch_end = 1;
	} else {
		ch_end = 2;
	}

	for (ch=0; ch<ch_end; ch++) {
		reg = 0x78 + 0x100 * ch;
		val = Get_NB32(dev, reg);
		val &= ~(1 << DqsRcvEnTrain);
		Set_NB32(dev, reg, val);
	}
}


/* mct_ModifyIndex_D
 * Function only used once so it was inlined.
 */


/* mct_GetInitFlag_D
 * Function only used once so it was inlined.
 */


void mct_SetRcvrEnDly_D(struct DCTStatStruc *pDCTstat, u8 RcvrEnDly,
			u8 FinalValue, u8 Channel, u8 Receiver, u32 dev,
			u32 index_reg, u8 Addl_Index, u8 Pass)
{
	u32 index;
	u8 i;
	u8 *p;
	u32 val;

	if(RcvrEnDly == 0xFE) {
		/*set the boudary flag */
		pDCTstat->Status |= 1 << SB_DQSRcvLimit;
	}

	/* DimmOffset not needed for CH_D_B_RCVRDLY array */


	for(i=0; i < 8; i++) {
		if(FinalValue) {
			/*calculate dimm offset */
			p = pDCTstat->CH_D_B_RCVRDLY[Channel][Receiver >> 1];
			RcvrEnDly = p[i];
		}

		/* if flag=0, set DqsRcvEn value to reg. */
		/* get the register index from table */
		index = Table_DQSRcvEn_Offset[i >> 1];
		index += Addl_Index;	/* DIMMx DqsRcvEn byte0 */
		val = Get_NB32_index_wait(dev, index_reg, index);
		if(i & 1) {
			/* odd byte lane */
			val &= ~(0xFF << 16);
			val |= (RcvrEnDly << 16);
		} else {
			/* even byte lane */
			val &= ~0xFF;
			val |= RcvrEnDly;
		}
		Set_NB32_index_wait(dev, index_reg, index, val);
	}

}

static void mct_SetMaxLatency_D(struct DCTStatStruc *pDCTstat, u8 Channel, u8 DQSRcvEnDly)
{
	u32 dev;
	u32 reg;
	u16 SubTotal;
	u32 index_reg;
	u32 reg_off;
	u32 val;
	u32 valx;

	if(pDCTstat->GangedMode)
		Channel = 0;

	dev = pDCTstat->dev_dct;
	reg_off = 0x100 * Channel;
	index_reg = 0x98 + reg_off;

	/* Multiply the CAS Latency by two to get a number of 1/2 MEMCLKs units.*/
	val = Get_NB32(dev, 0x88 + reg_off);
	SubTotal = ((val & 0x0f) + 1) << 1;	/* SubTotal is 1/2 Memclk unit */

	/* If registered DIMMs are being used then
	 *  add 1 MEMCLK to the sub-total.
	 */
	val = Get_NB32(dev, 0x90 + reg_off);
	if(!(val & (1 << UnBuffDimm)))
		SubTotal += 2;

	/* If the address prelaunch is setup for 1/2 MEMCLKs then
	 *  add 1, else add 2 to the sub-total.
	 *  if (AddrCmdSetup || CsOdtSetup || CkeSetup) then K := K + 2;
	 */
	val = Get_NB32_index_wait(dev, index_reg, 0x04);
	if(!(val & 0x00202020))
		SubTotal += 1;
	else
		SubTotal += 2;

	/* If the F2x[1, 0]78[RdPtrInit] field is 4, 5, 6 or 7 MEMCLKs,
	 * then add 4, 3, 2, or 1 MEMCLKs, respectively to the sub-total. */
	val = Get_NB32(dev, 0x78 + reg_off);
	SubTotal += 8 - (val & 0x0f);

	/* Convert bits 7-5 (also referred to as the course delay) of
	 * the current (or worst case) DQS receiver enable delay to
	 * 1/2 MEMCLKs units, rounding up, and add this to the sub-total.
	 */
	SubTotal += DQSRcvEnDly >> 5;	/*BOZO-no rounding up */

	/* Add 5.5 to the sub-total. 5.5 represents part of the
	 * processor specific constant delay value in the DRAM
	 * clock domain.
	 */
	SubTotal <<= 1;		/*scale 1/2 MemClk to 1/4 MemClk */
	SubTotal += 11;		/*add 5.5 1/2MemClk */

	/* Convert the sub-total (in 1/2 MEMCLKs) to northbridge
	 * clocks (NCLKs) as follows (assuming DDR400 and assuming
	 * that no P-state or link speed changes have occurred).
	 */

	/* New formula:
	 * SubTotal *= 3*(Fn2xD4[NBFid]+4)/(3+Fn2x94[MemClkFreq])/2 */
	val = Get_NB32(dev, 0x94 + reg_off);

	/* SubTotal div 4 to scale 1/4 MemClk back to MemClk */
	val &= 7;
	if (val == 4) {
		val++;		/* adjust for DDR2-1066 */
	}
	valx = (val + 3) << 2;

	val = Get_NB32(pDCTstat->dev_nbmisc, 0xD4);
	SubTotal *= ((val & 0x1f) + 4 ) * 3;

	SubTotal /= valx;
	if (SubTotal % valx) {	/* round up */
		SubTotal++;
	}

	/* Add 5 NCLKs to the sub-total. 5 represents part of the
	 * processor specific constant value in the northbridge
	 * clock domain.
	 */
	SubTotal += 5;

	pDCTstat->CH_MaxRdLat[Channel] = SubTotal;
	if(pDCTstat->GangedMode) {
		pDCTstat->CH_MaxRdLat[1] = SubTotal;
	}

	/* Program the F2x[1, 0]78[MaxRdLatency] register with
	 * the total delay value (in NCLKs).
	 */

	reg = 0x78 + reg_off;
	val = Get_NB32(dev, reg);
	val &= ~(0x3ff << 22);
	val |= (SubTotal & 0x3ff) << 22;

	/* program MaxRdLatency to correspond with current delay */
	Set_NB32(dev, reg, val);
}


static u8 mct_SavePassRcvEnDly_D(struct DCTStatStruc *pDCTstat,
			u8 rcvrEnDly, u8 Channel,
			u8 receiver, u8 Pass)
{
	u8 i;
	u8 mask_Saved, mask_Pass;
	u8 *p;

	/* calculate dimm offset
	 * not needed for CH_D_B_RCVRDLY array
	 */

	/* cmp if there has new DqsRcvEnDly to be recorded */
	mask_Pass = pDCTstat->DqsRcvEn_Pass;

	if(Pass == SecondPass) {
		mask_Pass = ~mask_Pass;
	}

	mask_Saved = pDCTstat->DqsRcvEn_Saved;
	if(mask_Pass != mask_Saved) {

		/* find desired stack offset according to channel/dimm/byte */
		if(Pass == SecondPass) {
			// FIXME: SecondPass is never used for Barcelona p = pDCTstat->CH_D_B_RCVRDLY_1[Channel][receiver>>1];
			p = 0; // Keep the compiler happy.
		} else {
			mask_Saved &= mask_Pass;
			p = pDCTstat->CH_D_B_RCVRDLY[Channel][receiver>>1];
		}
		for(i=0; i < 8; i++) {
			/* cmp per byte lane */
			if(mask_Pass & (1 << i)) {
				if(!(mask_Saved & (1 << i))) {
					/* save RcvEnDly to stack, according to
					the related Dimm/byte lane */
					p[i] = (u8)rcvrEnDly;
					mask_Saved |= 1 << i;
				}
			}
		}
		pDCTstat->DqsRcvEn_Saved = mask_Saved;
	}
	return mct_SaveRcvEnDly_D_1Pass(pDCTstat, Pass);
}


static u8 mct_CompareTestPatternQW0_D(struct MCTStatStruc *pMCTstat,
					struct DCTStatStruc *pDCTstat,
					u32 addr, u8 channel,
					u8 pattern, u8 Pass)
{
	/* Compare only the first beat of data.  Since target addrs are cache
	 * line aligned, the Channel parameter is used to determine which
	 * cache QW to compare.
	 */

	u8 *test_buf;
	u8 i;
	u8 result;
	u8 value;


	if(Pass == FirstPass) {
		if(pattern==1) {
			test_buf = (u8 *)TestPattern1_D;
		} else {
			test_buf = (u8 *)TestPattern0_D;
		}
	} else {		// Second Pass
		test_buf = (u8 *)TestPattern2_D;
	}

	SetUpperFSbase(addr);
	addr <<= 8;

	if((pDCTstat->Status & (1<<SB_128bitmode)) && channel ) {
		addr += 8;	/* second channel */
		test_buf += 8;
	}

	print_debug_dqs_pair("\t\t\t\t\t\t  test_buf = ", (u32)test_buf, "  |  addr_lo = ", addr,  4);
	for (i=0; i<8; i++) {
		value = read32_fs(addr);
		print_debug_dqs_pair("\t\t\t\t\t\t\t\t ", test_buf[i], "  |  ", value, 4);

		if (value == test_buf[i]) {
			pDCTstat->DqsRcvEn_Pass |= (1<<i);
		} else {
			pDCTstat->DqsRcvEn_Pass &= ~(1<<i);
		}
	}

	result = DQS_FAIL;

	if (Pass == FirstPass) {
		/* if first pass, at least one byte lane pass
		 * ,then DQS_PASS=1 and will set to related reg.
		 */
		if(pDCTstat->DqsRcvEn_Pass != 0) {
			result = DQS_PASS;
		} else {
			result = DQS_FAIL;
		}

	} else {
		/* if second pass, at least one byte lane fail
		 * ,then DQS_FAIL=1 and will set to related reg.
		 */
		if(pDCTstat->DqsRcvEn_Pass != 0xFF) {
			result = DQS_FAIL;
		} else {
			result = DQS_PASS;
		}
	}

	/* if second pass, we can't find the fail until FFh,
	 * then let it fail to save the final delay
	 */
	if((Pass == SecondPass) && (pDCTstat->Status & (1 << SB_DQSRcvLimit))) {
		result = DQS_FAIL;
		pDCTstat->DqsRcvEn_Pass = 0;
	}

	/* second pass needs to be inverted
	 * FIXME? this could be inverted in the above code to start with...
	 */
	if(Pass == SecondPass) {
		if (result == DQS_PASS) {
			result = DQS_FAIL;
		} else if (result == DQS_FAIL) { /* FIXME: doesn't need to be else if */
			result = DQS_PASS;
		}
	}


	return result;
}



static void mct_InitDQSPos4RcvrEn_D(struct MCTStatStruc *pMCTstat,
				struct DCTStatStruc *pDCTstat)
{
	/* Initialize the DQS Positions in preparation for
	 * Receiver Enable Training.
	 * Write Position is 1/2 Memclock Delay
	 * Read Position is 1/2 Memclock Delay
	 */
	u8 i;
	for(i=0;i<2; i++){
		InitDQSPos4RcvrEn_D(pMCTstat, pDCTstat, i);
	}
}


static void InitDQSPos4RcvrEn_D(struct MCTStatStruc *pMCTstat,
				struct DCTStatStruc *pDCTstat, u8 Channel)
{
	/* Initialize the DQS Positions in preparation for
	 * Receiver Enable Training.
	 * Write Position is no Delay
	 * Read Position is 1/2 Memclock Delay
	 */

	u8 i, j;
	u32 dword;
	u8 dn = 2; // TODO: Rev C could be 4
	u32 dev = pDCTstat->dev_dct;
	u32 index_reg = 0x98 + 0x100 * Channel;


	// FIXME: add Cx support
	dword = 0x00000000;
	for(i=1; i<=3; i++) {
		for(j=0; j<dn; j++)
			/* DIMM0 Write Data Timing Low */
			/* DIMM0 Write ECC Timing */
			Set_NB32_index_wait(dev, index_reg, i + 0x100 * j, dword);
	}

	/* errata #180 */
	dword = 0x2f2f2f2f;
	for(i=5; i<=6; i++) {
		for(j=0; j<dn; j++)
			/* DIMM0 Read DQS Timing Control Low */
			Set_NB32_index_wait(dev, index_reg, i + 0x100 * j, dword);
	}

	dword = 0x0000002f;
	for(j=0; j<dn; j++)
		/* DIMM0 Read DQS ECC Timing Control */
		Set_NB32_index_wait(dev, index_reg, 7 + 0x100 * j, dword);
}


void SetEccDQSRcvrEn_D(struct DCTStatStruc *pDCTstat, u8 Channel)
{
	u32 dev;
	u32 index_reg;
	u32 index;
	u8 ChipSel;
	u8 *p;
	u32 val;

	dev = pDCTstat->dev_dct;
	index_reg = 0x98 + Channel * 0x100;
	index = 0x12;
	p = pDCTstat->CH_D_BC_RCVRDLY[Channel];
	print_debug_dqs("\t\tSetEccDQSRcvrPos: Channel ", Channel,  2);
	for(ChipSel = 0; ChipSel < MAX_CS_SUPPORTED; ChipSel += 2) {
		val = p[ChipSel>>1];
		Set_NB32_index_wait(dev, index_reg, index, val);
		print_debug_dqs_pair("\t\tSetEccDQSRcvrPos: ChipSel ",
					ChipSel, " rcvr_delay ",  val, 2);
		index += 3;
	}
}


static void CalcEccDQSRcvrEn_D(struct MCTStatStruc *pMCTstat,
				struct DCTStatStruc *pDCTstat, u8 Channel)
{
	u8 ChipSel;
	u16 EccDQSLike;
	u8 EccDQSScale;
	u32 val, val0, val1;

	EccDQSLike = pDCTstat->CH_EccDQSLike[Channel];
	EccDQSScale = pDCTstat->CH_EccDQSScale[Channel];

	for (ChipSel = 0; ChipSel < MAX_CS_SUPPORTED; ChipSel += 2) {
		if(mct_RcvrRankEnabled_D(pMCTstat, pDCTstat, Channel, ChipSel)) {
			u8 *p;
			p = pDCTstat->CH_D_B_RCVRDLY[Channel][ChipSel>>1];

			/* DQS Delay Value of Data Bytelane
			 * most like ECC byte lane */
			val0 = p[EccDQSLike & 0x07];
			/* DQS Delay Value of Data Bytelane
			 * 2nd most like ECC byte lane */
			val1 = p[(EccDQSLike>>8) & 0x07];

			if(val0 > val1) {
				val = val0 - val1;
			} else {
				val = val1 - val0;
			}

			val *= ~EccDQSScale;
			val >>= 8; // /256

			if(val0 > val1) {
				val -= val1;
			} else {
				val += val0;
			}

			pDCTstat->CH_D_BC_RCVRDLY[Channel][ChipSel>>1] = val;
		}
	}
	SetEccDQSRcvrEn_D(pDCTstat, Channel);
}

void mctSetEccDQSRcvrEn_D(struct MCTStatStruc *pMCTstat,
			struct DCTStatStruc *pDCTstatA)
{
	u8 Node;
	u8 i;

	for (Node = 0; Node < MAX_NODES_SUPPORTED; Node++) {
		struct DCTStatStruc *pDCTstat;
		pDCTstat = pDCTstatA + Node;
		if (!pDCTstat->NodePresent)
			break;
		if (pDCTstat->DCTSysLimit) {
			for(i=0; i<2; i++)
				CalcEccDQSRcvrEn_D(pMCTstat, pDCTstat, i);
		}
	}
}


void phyAssistedMemFnceTraining(struct MCTStatStruc *pMCTstat,
			struct DCTStatStruc *pDCTstatA)
{

	u8 Node = 0;
	struct DCTStatStruc *pDCTstat;

	// FIXME: skip for Ax
	while (Node < MAX_NODES_SUPPORTED) {
		pDCTstat = pDCTstatA + Node;

		if(pDCTstat->DCTSysLimit) {
			fenceDynTraining_D(pMCTstat, pDCTstat, 0);
			fenceDynTraining_D(pMCTstat, pDCTstat, 1);
		}
		Node++;
	}
}


static void fenceDynTraining_D(struct MCTStatStruc *pMCTstat,
			struct DCTStatStruc *pDCTstat, u8 dct)
{
	u16 avRecValue;
	u32 val;
	u32 dev;
	u32 index_reg = 0x98 + 0x100 * dct;
	u32 index;

	/* BIOS first programs a seed value to the phase recovery engine
	 *  (recommended 19) registers.
	 * Dram Phase Recovery Control Register (F2x[1,0]9C_x[51:50] and
	 * F2x[1,0]9C_x52.) .
	 */

	dev = pDCTstat->dev_dct;
	for (index = 0x50; index <= 0x52; index ++) {
		val = (FenceTrnFinDlySeed & 0x1F);
		if (index != 0x52) {
			val |= val << 8 | val << 16 | val << 24;
		}
		Set_NB32_index_wait(dev, index_reg, index, val);
	}


	/* Set F2x[1,0]9C_x08[PhyFenceTrEn]=1. */
	val = Get_NB32_index_wait(dev, index_reg, 0x08);
	val |= 1 << PhyFenceTrEn;
	Set_NB32_index_wait(dev, index_reg, 0x08, val);

	/* Wait 200 MEMCLKs. */
	mct_Wait(50000);		/* wait 200us */

	/* Clear F2x[1,0]9C_x08[PhyFenceTrEn]=0. */
	val = Get_NB32_index_wait(dev, index_reg, 0x08);
	val &= ~(1 << PhyFenceTrEn);
	Set_NB32_index_wait(dev, index_reg, 0x08, val);

	/* BIOS reads the phase recovery engine registers
	 * F2x[1,0]9C_x[51:50] and F2x[1,0]9C_x52. */
	avRecValue = 0;
	for (index = 0x50; index <= 0x52; index ++) {
		val = Get_NB32_index_wait(dev, index_reg, index);
		avRecValue += val & 0x7F;
		if (index != 0x52) {
			avRecValue += (val >> 8) & 0x7F;
			avRecValue += (val >> 16) & 0x7F;
			avRecValue += (val >> 24) & 0x7F;
		}
	}

	val = avRecValue / 9;
	if (avRecValue % 9)
		val++;
	avRecValue = val;

	/* Write the (averaged value -8) to F2x[1,0]9C_x0C[PhyFence]. */
	avRecValue -= 8;
	val = Get_NB32_index_wait(dev, index_reg, 0x0C);
	val &= ~(0x1F << 16);
	val |= (avRecValue & 0x1F) << 16;
	Set_NB32_index_wait(dev, index_reg, 0x0C, val);

	/* Rewrite F2x[1,0]9C_x04-DRAM Address/Command Timing Control Register
	 * delays (both channels). */
	val = Get_NB32_index_wait(dev, index_reg, 0x04);
	Set_NB32_index_wait(dev, index_reg, 0x04, val);
}


void mct_Wait(u32 cycles)
{
	u32 saved;
	u32 hi, lo, msr;

	/* Wait # of 50ns cycles
	   This seems like a hack to me...  */

	cycles <<= 3;		/* x8 (number of 1.25ns ticks) */

	msr = 0x10;			/* TSC */
	_RDMSR(msr, &lo, &hi);
	saved = lo;
	do {
		_RDMSR(msr, &lo, &hi);
	} while (lo - saved < cycles );
}