Skip to content

Commit b67a191

Browse files
committed
Fixes to get DDR4 training working
1 parent 4fe9f1e commit b67a191

1 file changed

Lines changed: 189 additions & 140 deletions

File tree

hal/mpfs250.c

Lines changed: 189 additions & 140 deletions
Original file line numberDiff line numberDiff line change
@@ -279,14 +279,29 @@ static inline void mb(void)
279279
__asm__ volatile("fence iorw, iorw" ::: "memory");
280280
}
281281

282-
/* DDR-init delay. Forwards to rdcycle-based udelay() so the effective
283-
* delay tracks the current CPU frequency. The previous implementation
284-
* was a hardcoded ~40 MHz busy loop; after mss_pll_init() switches the
285-
* CPU clock to ~600 MHz it ran ~15x too short, silently violating
286-
* LPDDR4 reset / MR-write timing windows. */
282+
/* DDR-init busy-loop delay. The argument is NOT a real microsecond --
283+
* it is whatever the legacy busy-loop produces at the current CPU
284+
* clock. Empirically reaches train_stat=0x1D on the first attempt with
285+
* the same per-attempt rate as forwarding to udelay(), and is much
286+
* faster (~4 s vs ~50 s) for the TIP-wait timeout, which dominates
287+
* retry-loop time when training fails.
288+
*
289+
* Do NOT replace with udelay(us) without re-timing every call site
290+
* below: at 600 MHz the busy-loop delivers roughly us/20 of a real us,
291+
* so udelay(us) makes every post-PLL delay ~20x longer. In addition
292+
* to slowing retries, this can shift LPDDR4 / PHY timing windows --
293+
* earlier observed empirical data showed an isolated additional
294+
* regression beyond the pre-existing ~30% per-attempt failure rate.
295+
*
296+
* The "5us" / "250us" / "2ms" comments at the call sites are LEGACY
297+
* and do not reflect the actual delay; preserved for git blame, not
298+
* as timing references. */
287299
static void ddr_delay(uint32_t us)
288300
{
289-
udelay(us);
301+
volatile uint32_t i;
302+
for (i = 0; i < us * 10; i++) {
303+
__asm__ volatile("nop");
304+
}
290305
}
291306

292307
/* IOSCB Bank Controllers and DLL bases */
@@ -2983,11 +2998,22 @@ int mpfs_ddr_init(void)
29832998
wolfBoot_printf("MT53D512M32D2DS-053 x32 @ 1600 Mbps\n");
29842999
wolfBoot_printf("========================================\n");
29853000

2986-
/* Step 1: NWC/PLL initialization */
2987-
ret = nwc_init();
2988-
if (ret != 0) {
2989-
wolfBoot_printf("DDR: NWC init FAILED\n");
2990-
return -1;
3001+
/* Step 1: NWC/PLL initialization. Run only once per boot -- the
3002+
* MSS / DDR PLLs lock on first call and re-running mss_pll_init()
3003+
* hangs on the lock wait when called against an already-locked
3004+
* PLL. The outer retry loop in hal_init() re-enters this function
3005+
* for full controller/PHY re-init, but the PLLs only need to be
3006+
* brought up once. */
3007+
{
3008+
static int nwc_initialized = 0;
3009+
if (!nwc_initialized) {
3010+
ret = nwc_init();
3011+
if (ret != 0) {
3012+
wolfBoot_printf("DDR: NWC init FAILED\n");
3013+
return -1;
3014+
}
3015+
nwc_initialized = 1;
3016+
}
29913017
}
29923018

29933019
/* Step 2: Enable DDR controller clock */
@@ -3029,141 +3055,142 @@ int mpfs_ddr_init(void)
30293055
DDRPHY_REG(PHY_TIP_CFG_PARAMS) = LIBERO_SETTING_TIP_CFG_PARAMS;
30303056
mb();
30313057

3032-
/* Step 9: Run training, with retry-on-failure mirroring HSS's
3033-
* DDR_TRAINING_FAIL state machine (mss_ddr.c:512). HSS retries
3034-
* up to MAX_RETRY_COUNT times: on each retry it resets CKE,
3035-
* forces controller reset, clears DFI/CTRLR_INIT, then re-runs
3036-
* training (which selects a different refclk_offset internally).
3058+
/* Step 9: Run training + post-training + MTC sanity, with retry on
3059+
* MTC failure.
30373060
*
3038-
* Phase 3.10.3 (2): we don't yet have the refclk_offset sweep
3039-
* inside our run_training, but trying just the basic retry
3040-
* (controller reset + re-init) might surface whether
3041-
* train_stat advances on a fresh attempt.
3061+
* Why MTC is the retry trigger (not PHY_TRAINING_STATUS): when the
3062+
* manual ADDCMD training picks a marginal phase/dly that doesn't
3063+
* resolve into a usable DRAM alignment, train_stat sticks at 0x1
3064+
* (BCLK_SCLK only). But TIP keeps spinning in the background and
3065+
* eventually flips the WRLVL/RDGATE/DQ_DQS bits to read 0x1D, even
3066+
* though the alignment is bogus. An outer retry keyed on
3067+
* PHY_TRAINING_STATUS sees that bogus 0x1D and stops. MTC actually
3068+
* exercises the DDR controller -- it times out unambiguously when
3069+
* training was bad, and is the reliable signal.
3070+
*
3071+
* Empirical baseline: ~30% per-attempt training failure rate -> 5
3072+
* retries gives ~99.7% cumulative success rate.
30423073
*/
30433074
{
3044-
uint32_t retry_count = 0;
3045-
const uint32_t MAX_RETRY = 3;
3046-
uint32_t train_stat_now;
3047-
3048-
ret = run_training();
3049-
train_stat_now = DDRPHY_REG(PHY_TRAINING_STATUS);
3050-
while ((train_stat_now & TRAINING_MASK) != (BCLK_SCLK_BIT | WRLVL_BIT |
3051-
RDGATE_BIT | DQ_DQS_BIT) && retry_count < MAX_RETRY) {
3052-
wolfBoot_printf("DDR: Training retry %lu (train_stat=0x%x)\n",
3053-
(unsigned long)retry_count, train_stat_now);
3054-
/* HSS DDR_TRAINING_FAIL reset sequence (mss_ddr.c:519-538) */
3055-
DDRCFG_REG(MC_INIT_CS) = 0x1;
3056-
DDRCFG_REG(MC_INIT_DISABLE_CKE) = 0x1;
3057-
ddr_delay(500); /* DELAY_CYCLES_5_MICRO */
3058-
DDRCFG_REG(MC_INIT_FORCE_RESET) = 0x1;
3059-
ddr_delay(200000); /* DELAY_CYCLES_2MS */
3060-
retry_count++;
3061-
DDRCFG_REG(MC_DFI_INIT_START) = 0x0;
3062-
DDRCFG_REG(MC_CTRLR_INIT) = 0x0;
3063-
DDRPHY_REG(PHY_TRAINING_START) = 0x0;
3064-
mb();
3065-
/* Re-run training */
3075+
uint32_t train_retry = 0;
3076+
/* 1 inner attempt only. When manual training picks a marginal
3077+
* alignment, TIP cannot recover and the MTC engine wedges
3078+
* (DONE_ACK stuck at 0 even with train_stat=0x1D). Re-running
3079+
* run_training() inside the same controller-state cannot
3080+
* unwedge MTC -- the outer loop in hal_init re-enters
3081+
* mpfs_ddr_init from scratch, which clears MTC via the SYSREG
3082+
* DDRC soft-reset pulse, and that's the only path that
3083+
* actually recovers. */
3084+
const uint32_t MAX_TRAIN_RETRY = 1;
3085+
uint32_t lane;
3086+
uint32_t mtc_to;
3087+
int mtc_pass = 0;
3088+
3089+
while (train_retry < MAX_TRAIN_RETRY) {
3090+
if (train_retry > 0) {
3091+
wolfBoot_printf(
3092+
"DDR: Retry %u/%u after MTC sanity FAIL\n",
3093+
(unsigned)train_retry, (unsigned)MAX_TRAIN_RETRY);
3094+
/* HSS DDR_TRAINING_FAIL reset sequence (mss_ddr.c:519-538) */
3095+
DDRCFG_REG(MC_INIT_CS) = 0x1;
3096+
DDRCFG_REG(MC_INIT_DISABLE_CKE) = 0x1;
3097+
ddr_delay(500);
3098+
DDRCFG_REG(MC_INIT_FORCE_RESET) = 0x1;
3099+
ddr_delay(200000);
3100+
DDRCFG_REG(MC_DFI_INIT_START) = 0x0;
3101+
DDRCFG_REG(MC_CTRLR_INIT) = 0x0;
3102+
DDRPHY_REG(PHY_TRAINING_START) = 0x0;
3103+
mb();
3104+
}
3105+
train_retry++;
3106+
30663107
ret = run_training();
3067-
train_stat_now = DDRPHY_REG(PHY_TRAINING_STATUS);
3068-
}
3069-
wolfBoot_printf("DDR: Final train_stat=0x%x after %lu retries\n",
3070-
train_stat_now, (unsigned long)retry_count);
3071-
}
3072-
if (ret != 0) {
3073-
wolfBoot_printf("DDR: Training FAILED\n");
3074-
return -2;
3075-
}
3108+
if (ret != 0) {
3109+
continue;
3110+
}
30763111

3077-
/* Phase 3.10.3 (D-3 v2): HSS post-training sequence.
3078-
*
3079-
* After train_stat=0x1D, HSS does these critical steps before any
3080-
* CPU access (mss_ddr.c DDR_TRAINING_WRITE_CALIBRATION + after):
3081-
*
3082-
* (a) Set rpc220 = 0xC (LPDDR4 default DQ delay center)
3083-
* (b) load_dq(lane) for each of 4 lanes -- per-lane DQ delay load
3084-
* (c) write_calibration_using_mtc() -- HSS's MTC sweep (validates
3085-
* DDR via the on-chip MTC engine, no CPU bus involved)
3086-
* (d) MTC_test counting + pseudo-random patterns (DDR_FULL_MTC_CHECK)
3087-
* (e) Then the CPU access at 0xC0000000 / 0x80000000 succeeds.
3088-
*
3089-
* Without (a)-(d), the first CPU write to DDR hangs (we observe this
3090-
* for both cached 0x80000000 and non-cached 0xC0000000). The MTC
3091-
* activity exercises the DDR controller and seems to "wake up" the
3092-
* data path / drain any lingering training state.
3093-
*/
3094-
wolfBoot_printf("DDR: Post-training sequence...\n");
3112+
/* HSS DDR_TRAINING_SET_FINAL_MODE: rewrite DDRPHY_MODE with
3113+
* LIBERO setting to transition PHY from training to
3114+
* operational mode (mss_ddr.c:1619). */
3115+
wolfBoot_printf("DDR: Post-training sequence...\n");
3116+
DDRPHY_REG(PHY_MODE) = LIBERO_SETTING_DDRPHY_MODE;
3117+
mb();
3118+
wolfBoot_printf(" DDRPHY_MODE -> 0x%x (final)\n",
3119+
DDRPHY_REG(PHY_MODE));
30953120

3096-
/* DDR_TRAINING_SET_FINAL_MODE (HSS mss_ddr.c:1619): rewrite
3097-
* DDRPHY_MODE with LIBERO setting after training success. This
3098-
* transitions the PHY from training mode to operational mode. */
3099-
DDRPHY_REG(PHY_MODE) = LIBERO_SETTING_DDRPHY_MODE;
3100-
mb();
3101-
wolfBoot_printf(" DDRPHY_MODE -> 0x%x (final)\n",
3102-
DDRPHY_REG(PHY_MODE));
3121+
/* rpc220 = 0xC for LPDDR4 -- centers DQ/DQS sampling. */
3122+
DDRPHY_REG(PHY_RPC220) = 0x0CUL;
3123+
mb();
31033124

3104-
/* (a) rpc220 = 0xC for LPDDR4 -- centers DQ/DQS sampling */
3105-
DDRPHY_REG(PHY_RPC220) = 0x0CUL;
3106-
mb();
3125+
/* load_dq(lane) for each of 4 lanes (HSS mss_ddr.c:2916).
3126+
* Per-lane: clear move, set DFI override + expert mode,
3127+
* pulse load, restore expert mode. */
3128+
for (lane = 0; lane < 4; lane++) {
3129+
DDRPHY_REG(PHY_EXPERT_DLYCNT_MOVE0) = 0x00UL;
3130+
DDRPHY_REG(PHY_EXPERT_DFI_STATUS_TO_SHIM) = 0x07UL;
3131+
DDRPHY_REG(PHY_EXPERT_MODE_EN) = 0x21UL;
3132+
DDRPHY_REG(PHY_EXPERT_DLYCNT_LOAD0) =
3133+
(0xFFUL << (lane * 8UL));
3134+
DDRPHY_REG(PHY_EXPERT_DLYCNT_LOAD0) = 0x00UL;
3135+
DDRPHY_REG(PHY_EXPERT_MODE_EN) = 0x08UL;
3136+
}
3137+
mb();
3138+
wolfBoot_printf(" load_dq done for 4 lanes\n");
3139+
wolfBoot_printf(
3140+
" CTRLR_INIT_DONE=0x%x AUTOINIT_DIS=0x%x train_stat=0x%x\n",
3141+
DDRCFG_REG(MC_CTRLR_INIT_DONE),
3142+
DDRCFG_REG(MC_INIT_AUTOINIT_DISABLE),
3143+
DDRPHY_REG(PHY_TRAINING_STATUS));
3144+
3145+
/* MTC sanity: smallest region (size=8 -> 2^8 = 256 B),
3146+
* counting pattern, sequential addressing, RW. */
3147+
DDRCFG_REG(MT_EN) = 0;
3148+
DDRCFG_REG(MT_EN_SINGLE) = 0;
3149+
DDRCFG_REG(MT_STOP_ON_ERROR) = 0;
3150+
DDRCFG_REG(0x440C) = 0; /* MT_RD_ONLY */
3151+
DDRCFG_REG(0x4410) = 0; /* MT_WR_ONLY */
3152+
DDRCFG_REG(MT_DATA_PATTERN) = 0;
3153+
DDRCFG_REG(MT_ADDR_PATTERN) = 0;
3154+
DDRCFG_REG(MT_START_ADDR_0) = 0;
3155+
DDRCFG_REG(MT_START_ADDR_1) = 0;
3156+
DDRCFG_REG(MT_ADDR_BITS) = 8;
3157+
DDRCFG_REG(MT_ERROR_MASK_0) = 0xFFFFFFFFUL;
3158+
DDRCFG_REG(MT_ERROR_MASK_1) = 0xFFFFFFFFUL;
3159+
DDRCFG_REG(MT_ERROR_MASK_2) = 0xFFFFFFFFUL;
3160+
DDRCFG_REG(MT_ERROR_MASK_3) = 0xFFFFFFFFUL;
3161+
DDRCFG_REG(MT_ERROR_MASK_4) = 0xFFFFFFFFUL;
3162+
DDRCFG_REG(MT_EN_SINGLE) = 0;
3163+
DDRCFG_REG(MT_EN_SINGLE) = 1;
3164+
mtc_to = 0xFFFFFFUL;
3165+
while ((DDRCFG_REG(MT_DONE_ACK) & 0x1UL) == 0 && mtc_to > 0) {
3166+
mtc_to--;
3167+
}
3168+
if (mtc_to == 0) {
3169+
wolfBoot_printf(
3170+
" MTC 256B TIMEOUT (DONE_ACK=0x%x ERR_STS=0x%x)\n",
3171+
DDRCFG_REG(MT_DONE_ACK), DDRCFG_REG(MT_ERROR_STS));
3172+
continue;
3173+
}
3174+
if ((DDRCFG_REG(MT_ERROR_STS) & 0x1UL) != 0) {
3175+
wolfBoot_printf(" MTC 256B FAIL (err_sts=0x%x)\n",
3176+
DDRCFG_REG(MT_ERROR_STS));
3177+
continue;
3178+
}
3179+
wolfBoot_printf(" MTC 256B PASS (err_sts=0x%x to_used=0x%x)\n",
3180+
DDRCFG_REG(MT_ERROR_STS),
3181+
(unsigned int)(0xFFFFFFUL - mtc_to));
31073182

3108-
/* (b) load_dq(lane) for each of 4 lanes (HSS mss_ddr.c:2916).
3109-
* Per-lane sequence: clear move, set DFI override + expert mode,
3110-
* pulse load, restore expert mode. */
3111-
{
3112-
uint32_t lane;
3113-
for (lane = 0; lane < 4; lane++) {
3114-
DDRPHY_REG(PHY_EXPERT_DLYCNT_MOVE0) = 0x00UL;
3115-
DDRPHY_REG(PHY_EXPERT_DFI_STATUS_TO_SHIM) = 0x07UL;
3116-
DDRPHY_REG(PHY_EXPERT_MODE_EN) = 0x21UL;
3117-
DDRPHY_REG(PHY_EXPERT_DLYCNT_LOAD0) = (0xFFUL << (lane * 8UL));
3118-
DDRPHY_REG(PHY_EXPERT_DLYCNT_LOAD0) = 0x00UL;
3119-
DDRPHY_REG(PHY_EXPERT_MODE_EN) = 0x08UL;
3183+
mtc_pass = 1;
3184+
break;
31203185
}
3121-
mb();
3122-
wolfBoot_printf(" load_dq done for 4 lanes\n");
3123-
}
3124-
3125-
/* Pre-MTC diagnostic snapshot */
3126-
wolfBoot_printf(" CTRLR_INIT_DONE=0x%x AUTOINIT_DIS=0x%x train_stat=0x%x\n",
3127-
DDRCFG_REG(MC_CTRLR_INIT_DONE),
3128-
DDRCFG_REG(MC_INIT_AUTOINIT_DISABLE),
3129-
DDRPHY_REG(PHY_TRAINING_STATUS));
31303186

3131-
/* (c)+(d) MTC sanity check. Smallest region (size=8 -> 2^8 = 256 B).
3132-
* Per HSS MTC_test sequence: MT_EN=0, MT_RD_ONLY=0, MT_WR_ONLY=0, ... */
3133-
{
3134-
uint32_t mtc_to;
3135-
uint32_t mtc_err;
3136-
DDRCFG_REG(MT_EN) = 0;
3137-
DDRCFG_REG(MT_EN_SINGLE) = 0;
3138-
DDRCFG_REG(MT_STOP_ON_ERROR) = 0;
3139-
DDRCFG_REG(0x440C) = 0; /* MT_RD_ONLY = 0 (normal RW) */
3140-
DDRCFG_REG(0x4410) = 0; /* MT_WR_ONLY = 0 */
3141-
DDRCFG_REG(MT_DATA_PATTERN) = 0; /* counting pattern */
3142-
DDRCFG_REG(MT_ADDR_PATTERN) = 0; /* sequential */
3143-
DDRCFG_REG(MT_START_ADDR_0) = 0;
3144-
DDRCFG_REG(MT_START_ADDR_1) = 0;
3145-
DDRCFG_REG(MT_ADDR_BITS) = 8; /* 2^8 = 256 bytes */
3146-
DDRCFG_REG(MT_ERROR_MASK_0) = 0xFFFFFFFFUL;
3147-
DDRCFG_REG(MT_ERROR_MASK_1) = 0xFFFFFFFFUL;
3148-
DDRCFG_REG(MT_ERROR_MASK_2) = 0xFFFFFFFFUL;
3149-
DDRCFG_REG(MT_ERROR_MASK_3) = 0xFFFFFFFFUL;
3150-
DDRCFG_REG(MT_ERROR_MASK_4) = 0xFFFFFFFFUL;
3151-
DDRCFG_REG(MT_EN_SINGLE) = 0;
3152-
DDRCFG_REG(MT_EN_SINGLE) = 1; /* Run */
3153-
mtc_to = 0xFFFFFFUL;
3154-
while ((DDRCFG_REG(MT_DONE_ACK) & 0x1UL) == 0 && mtc_to > 0) {
3155-
mtc_to--;
3156-
}
3157-
if (mtc_to == 0) {
3158-
wolfBoot_printf(" MTC 256B TIMEOUT (DONE_ACK=0x%x ERR_STS=0x%x)\n",
3159-
DDRCFG_REG(MT_DONE_ACK), DDRCFG_REG(MT_ERROR_STS));
3160-
} else {
3161-
mtc_err = DDRCFG_REG(MT_ERROR_STS) & 0x1UL;
3162-
wolfBoot_printf(" MTC 256B %s (err_sts=0x%x to_used=0x%x)\n",
3163-
mtc_err == 0 ? "PASS" : "FAIL",
3164-
DDRCFG_REG(MT_ERROR_STS),
3165-
(unsigned int)(0xFFFFFFUL - mtc_to));
3187+
if (!mtc_pass) {
3188+
wolfBoot_printf("DDR: Training/MTC failed after %u retries\n",
3189+
(unsigned)MAX_TRAIN_RETRY);
3190+
return -2;
31663191
}
3192+
wolfBoot_printf("DDR: Training+MTC PASS after %u retries\n",
3193+
(unsigned)(train_retry - 1));
31673194
}
31683195

31693196
/* DDR pre-fill is currently disabled because both PDMA-based and
@@ -3275,8 +3302,7 @@ int mpfs_ddr_init(void)
32753302

32763303
wolfBoot_printf("DDR-TEST: train_stat=0x%x\n",
32773304
DDRPHY_REG(PHY_TRAINING_STATUS));
3278-
wolfBoot_printf("DDR-TEST: HALT\n");
3279-
while (1) { ; }
3305+
wolfBoot_printf("DDR-TEST: done\n");
32803306
}
32813307
#endif /* MPFS_DDR_PATTERN_TEST */
32823308

@@ -3326,9 +3352,32 @@ void hal_init(void)
33263352
wolfBoot_printf("Running on E51 (hart 0) in M-mode\n");
33273353

33283354
#ifdef MPFS_DDR_INIT
3329-
/* Bring up LPDDR4 before any DDR-resident operations */
3330-
if (mpfs_ddr_init() != 0) {
3331-
wolfBoot_printf("DDR: Init FAILED - continuing with L2 only\n");
3355+
/* Bring up LPDDR4 before any DDR-resident operations.
3356+
*
3357+
* Outer retry loop: each call to mpfs_ddr_init() does a SYSREG DDRC
3358+
* soft-reset pulse, which clears the MTC engine state. If the
3359+
* inner retry inside mpfs_ddr_init() exhausts (typically because
3360+
* MTC wedged after the first failure), come back here for a full
3361+
* controller re-init. Empirical: per-attempt failure rate ~30%, so
3362+
* 3 outer attempts cover ~97% of boots (1 - 0.3^3). */
3363+
{
3364+
unsigned int outer_retry;
3365+
const unsigned int MAX_OUTER_RETRY = 3;
3366+
int ddr_ok = 0;
3367+
for (outer_retry = 0; outer_retry < MAX_OUTER_RETRY; outer_retry++) {
3368+
if (outer_retry > 0) {
3369+
wolfBoot_printf(
3370+
"DDR: Outer retry %u/%u (full DDRC re-init)\n",
3371+
outer_retry, MAX_OUTER_RETRY);
3372+
}
3373+
if (mpfs_ddr_init() == 0) {
3374+
ddr_ok = 1;
3375+
break;
3376+
}
3377+
}
3378+
if (!ddr_ok) {
3379+
wolfBoot_printf("DDR: Init FAILED - continuing with L2 only\n");
3380+
}
33323381
}
33333382
#endif
33343383
#endif

0 commit comments

Comments
 (0)