/* SPDX-License-Identifier: GPL-2.0-only */
/* Early initialization code for aarch64 (a.k.a. armv8) */

#include <arch/asm.h>
#include <soc/addressmap.h>

	.arch		armv8-a+fp


/*
 * _start - first code executed in this image.
 *
 * Values parked for later use (read back in `start`):
 *   d30 <- x0 exactly as received on entry
 *   d29 <- run-time (PC-relative) address of _start
 *
 * Scratch registers written in this block: x0-x3, x22, q0, q1, flags and
 * x30 (through bl).
 *
 * Layout constraint: the code begins at .org 0 and a second entry point,
 * a plain branch back to _start, is placed at .org 0x100. Everything in
 * between therefore has to fit into the first 0x100 bytes.
 */
ENTRY(_start)

    .org 0
     /**
     * According to the reference manual the first instruction is fetched from
     * offset 0x100, but at offset 0 a branch instruction is always placed.
     * Support two entry points for now.
     * To save memory put the Cavium specific init code between those two
     * entry points.
     */
    ic      ialluis
    fmov    d30, x0     /* Save X0 in FPR for use later */
     /**
      * The BDK stores X1 for later use, but it turns out that we don't need
      * this "feature". The idea is to hide the devicetree somewhere in
      * flash, so that only the ROM will find it and point to it using X1.
      * The incoming X1 is therefore overwritten right below.
      */
    adr     x1, _start  /* x1 = _start location based on PC */
    fmov    d29, x1     /* Save PC in FPR for use later */

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    /* Change the core to big endian mode for EL3 */
    mrs     x0, SCTLR_EL3
    mov     x1, 1<<25       /* Set SCTLR_EL3[ee]=1 */
    orr     x0, x0, x1
    msr     SCTLR_EL3, x0
    /*
     * Byte-swap helpers used on values written to device registers.
     * NOTE(review): the 32-bit variant expands to the same `rev reg, reg`
     * as the 64-bit one, so its effect depends on the register width the
     * caller passes. No user of it is visible in this file section.
     */
    #define ENDIAN_CONVERT64(reg) rev reg, reg
    #define ENDIAN_CONVERT32(reg) rev reg, reg
    #define ENDIAN_CONVERT16(reg) rev16 reg, reg
#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    /* Nothing needed, default is little endian; the helpers expand to nothing */
    #define ENDIAN_CONVERT64(reg)
    #define ENDIAN_CONVERT32(reg)
    #define ENDIAN_CONVERT16(reg)
#else
    #error Unknown endianness
#endif

    /* x0 = LMC0_PF_BAR0, assembled from its upper and lower 32-bit halves */
    mov     x0, (LMC0_PF_BAR0 >> 32)
    lsl     x0, x0, 32
    mov     x1, (LMC0_PF_BAR0 & 0xffffffff)
    orr     x0, x0, x1

    /* Test if DRAM PLL is running */
    ldr     x1, [x0, LMC0_DDR_PLL_CTL0]

    /* Bit 7 (0x80) of the register decides: if set, skip the CAR setup */
    tst     x1, 0x80

    b.ne     cache_setup_done

    /* Returns here; bl overwrites x30, so the value x30 had on entry is lost */
    bl      _setup_car

cache_setup_done:

    /* Check that we're running on the node we're linked for */
    mrs     x0, MPIDR_EL1
    ubfx    x0, x0, 16, 8       /* Bits 23:16 are the physical node ID */
    mov     x1, 0x0             /* Only node 0 is accepted */
    cmp     x0, x1

    b.ne    _wfi                /* Any other node is parked */

    /* No branch in this block targets this label; it is reached by fall-through */
node_check_done:
    /* Get code position: x1 = BOOTROM_OFFSET + 0x020000 */
    mov     x1, 0x020000
    mov     x0, BOOTROM_OFFSET
    add     x1, x0, x1

    /* x0 = address _start is actually executing from */
    adr     x0, _start

    /**
     * Check if IROM has loaded the code to BOOTROM_OFFSET.
     * In case the offset is wrong, try to relocate.
     * Ideally the following code is never executed.
     * FIXME: Add region overlap check.
     */
    cmp x0, x1
    b.eq    after_relocate

relocate:
    /*
     * Get bootblock length.
     * NOTE(review): as written this computes _program - _eprogram. If
     * _eprogram is the end symbol (as the name suggests - confirm against
     * the linker script) the result is not positive, and the loop below
     * then runs exactly once (32 bytes) because b.gt fails immediately.
     */
    ldr     x2, =_program
    ldr     x3, =_eprogram
    sub     x2, x2, x3
    b       copy_code

/* 2^7 = 128-byte alignment; presumably keeps the copy loop in one cache line */
.align 7
copy_code:
    /*
     * Copies 32 bytes per iteration from [x1] to [x0], post-incrementing
     * both pointers. With the values set up above that is from
     * BOOTROM_OFFSET + 0x020000 to the address we are executing from.
     * NOTE(review): confirm this is the intended direction; the branch
     * target computed after the loop is relative to BOOTROM_OFFSET alone.
     */
    ldp     q0, q1, [x1], 32    /* Load 32 bytes */
    subs    w2, w2, 32          /* Subtract 32 from length, setting flags */
    stp     q0, q1, [x0], 32    /* Store 32 bytes */
    b.gt    copy_code           /* Repeat if length is still positive */
    dmb     sy

    /* Load the actual location we're supposed to be at */
    adr     x0, after_relocate  /* Relative address */
    adr     x1, _start          /* Relative address */
    sub     x0, x0, x1     /* This only works if _start is supposed to be zero */
    mov     x1, BOOTROM_OFFSET
    add     x0, x0, x1     /* x0 = BOOTROM_OFFSET + (after_relocate - _start) */
    br      x0             /* Branch to relocated code */

    /*
     * NOTE(review): not reachable as written - it follows an unconditional
     * br and carries no label, so the instruction cache is not invalidated
     * on the relocation path.
     */
    ic      ialluis        /* Clear the icache now that all code is correct */

after_relocate:
    /*
     * Allow unaligned memory access as long as MMU is disabled.
     * Read-modify-write of the implementation-defined system register
     * s3_0_c11_c0_4, setting bit 37.
     */
    mrs     x22, s3_0_c11_c0_4
    orr     x22, x22, # (1 << 37) /* Set DCVA47 */
    msr     s3_0_c11_c0_4, x22

    /* Continue in `start`; nothing but padding follows up to offset 0x100 */
    bl      start

    /* Real entry point */
    .org 0x100
    b      _start
ENDPROC(_start)


/*
 * _setup_car - prepare the L2 cache for use as RAM (CAR).
 *
 * Called with bl from _start and returns with ret, except that a part
 * number of 0xb0 or above diverts to _wfi instead. No stack is used.
 *
 * Steps, in order:
 *   1. Program the L2C region registers and core 0 way partition.
 *   2. Issue the line-lock operation on every cache line covering
 *      [_sram, _esram).
 *   3. Read and write back every locked line so it is marked dirty.
 *   4. Write to L2C_TAD0_INT_W1C to clear the interrupt flags raised
 *      by the accesses above.
 *
 * Clobbers: x0-x4, q0, q1, flags.
 */
ENTRY(_setup_car)
    mrs     x0, MIDR_EL1
    ubfx    x0, x0, 4, 12       /* Bits 15:4 are the part number */
    cmp     x0, 0xb0
    b.ge    _wfi                /* Part numbers >= 0xb0 are not handled here */

thunder1_cache_setup:
   /**
    * Setup L2 cache to allow secure access to all of the address space
    * thunder1 compatibility list:
    * - CN81XX
    * - CN83XX
    * - CN88XX
    */
    /* Register offsets from L2C_PF_BAR0; undefined again further below */
    #define REGIONX_START   0x1000
    #define REGIONX_END     0x1008
    #define REGIONX_ATTR    0x1010
    /* x0 = L2C_PF_BAR0, assembled from its upper and lower 32-bit halves */
    mov     x0, L2C_PF_BAR0 >> 32
    lsl     x0, x0, 32
    mov     x1, (L2C_PF_BAR0 & 0xffffffff)
    orr     x0, x0, x1
    str     xzr, [x0, REGIONX_START]    /* Start of zero */
    mov     x1, 0x3fffff00000           /* End of max address */
    ENDIAN_CONVERT64(x1)
    str     x1, [x0, REGIONX_END]
    mov     x1, 2                       /* Secure only access */
    ENDIAN_CONVERT64(x1)
    str     x1, [x0, REGIONX_ATTR]
    /* Update way partition to allow core 0 to write to L2 */
    #define L2C_WPAR_PP0_OFFSET 0x40000
    mov     x1, L2C_WPAR_PP0_OFFSET
    str     xzr, [x0, x1]
    ldr     xzr, [x0, x1]       /* Read back to make sure done */
    #undef REGIONX_START
    #undef REGIONX_END
    #undef REGIONX_ATTR
    #undef L2C_WPAR_PP0_OFFSET

    /**
     * At this point the whole CAR is readable and writeable, but if
     * we touch too many cache-lines our code might get flushed out.
     * We have to lock all cache-lines that are to be used as RAM, which are
     * the ones marked as SRAM in memlayout.
     */
    mrs     x0, CTR_EL0            /* Get cache-line size */
    /* [19:16] - Indicates Log2(number of words in cache line) */
    ubfx    x0, x0, 16, 4
    mov     x1, 4                  /* Bytes in a word (32-bit) */
    lsl     x0, x1, x0             /* Number of Bytes in x0 */

    /* x1 = ~(line size - 1): clears the offset-within-line bits */
    sub     x1, x0, 1
    mvn     x1, x1                 /* Place mask in x1 */

    /* x3 = _sram rounded down, x4 = _esram rounded up to a line boundary */
    ldr     x3, =_sram
    and     x3, x3, x1             /* Align addresses with cache-lines */
    ldr     x4, =_esram
    add     x4, x4, x0
    sub     x4, x4, 1
    and     x4, x4, x1             /* Align addresses with cache-lines */
    sub     x2, x4, x3             /* Store sram length in x2 */

lock_cache_lines:
    /*
     * Implementation-defined SYS operation taking the line address in x3;
     * per the comment above it locks that line - presumably the Cavium L2
     * lock operation, TODO confirm against the hardware manual.
     */
    sys     #0, c11, c1, #4, x3
    add     x3, x3, x0             /* Increment address by cache-line bytes */
    subs    w2, w2, w0             /* Subtract cache-line bytes from length */
    b.gt    lock_cache_lines       /* Repeat if length is still positive */

    /**
     * The locked region isn't considered dirty by L2. Do read/write of
     * each cache line to force each to be dirty. This is needed across the
     * whole line to make sure the L2 dirty bits are all up to date.
     * NOTE: If we'd relocate we could memset the whole memory !
     */
    /* Recompute the same aligned range; x3 and x2 were consumed by the loop above */
    ldr     x3, =_sram
    and     x3, x3, x1             /* Align addresses with cache-lines */
    ldr     x4, =_esram
    add     x4, x4, x0
    sub     x4, x4, 1
    and     x4, x4, x1             /* Align addresses with cache-lines */
    sub     x2, x4, x3             /* Store sram length in x2 */
    mov     x4, x3                 /* Read and write pointers start at the same address */
    b       dirty_cache_line

/* 2^7 = 128-byte alignment; presumably keeps the loop in one cache line */
.align 7
dirty_cache_line:
    /* x3 == x4 throughout, so every 32 bytes are written back in place */
    ldp     q0, q1, [x3], 32    /* Load 32 bytes */
    subs    w2, w2, 32          /* Subtract 32 from length, setting flags */
    stp     q0, q1, [x4], 32    /* Store 32 bytes */
    b.gt    dirty_cache_line    /* Repeat if length is still positive */
    dmb     sy

clear_interrupts:
    /**
     * As the memory controller isn't running, but we access the DRAM's
     * address space, some interrupt flags have been set.
     * Tidy up our mess now (valid for CN81XX only).
     */
    /* x0 = L2C_TAD0_INT_W1C, assembled from its upper and lower 32-bit halves */
    mov     x0, (L2C_TAD0_INT_W1C >> 32)
    lsl     x0, x0, 32
    mov     x1, (L2C_TAD0_INT_W1C & 0xffffffff)
    orr     x0, x0, x1

    /*
     * NOTE(review): the current value is read, 0x1c00 is ORed in and the
     * result is written back. If the register is write-1-to-clear, as its
     * name suggests, this also clears every other flag that was pending -
     * confirm that is intended. Unlike the REGIONX writes above, no
     * ENDIAN_CONVERT64 is applied to the constant here.
     */
    ldr x1, [x0]
    orr x1, x1, 0x1c00 /* Clear WRDISLMC, RDDISLMC, RDNXM */
    str x1, [x0]

    ret
ENDPROC(_setup_car)

/*
 * _wfi - park the executing core; never returns.
 *
 * Branched to (not called) when the core must not continue booting:
 * wrong node ID in _start, unsupported part number in _setup_car.
 *
 * WFI is only a hint: it completes on any wake-up event, so the core has
 * to be sent back to sleep in a loop. Without the branch, execution would
 * run past the end of this function into whatever code follows it.
 */
ENTRY(_wfi)
1:
    wfi
    b       1b                  /* Woken up: go back to sleep */
ENDPROC(_wfi)

/*
 * start - hand over from the early assembly to the C bootblock.
 *
 * Entered with bl from _start. Expects:
 *   d30 = x0 as received by _start from the previous image
 *   d29 = run-time address of _start
 * Both are moved back into x0/x1 and passed as the first two arguments of
 * bootblock_main. arm64_init_cpu is defined elsewhere; it has to leave
 * d29 and d30 untouched for this to work.
 *
 * Does not return: if bootblock_main ever comes back, the core is parked
 * instead of running off the end of this function.
 */
ENTRY(start)
    bl      arm64_init_cpu

    fmov    x0, d30 /* The original X0, info from previous image */
    fmov    x1, d29 /* The original PC we were loaded at */

    /* Call C entry */
    bl      bootblock_main

    /* Not expected to be reached; park the core rather than fall through */
1:
    wfi
    b       1b
ENDPROC(start)