How the Go Runtime Starts a Program

Compile the following code

package main

// main is the entire sample program: it writes a greeting with the builtin
// println, so the binary needs no imports beyond the Go runtime itself.
func main() {
    const greeting = "Hello Go"
    println(greeting)
}

Find the entry point of the Go executable

Use readelf to view the entry address of the executable binary, and use the nm tool of the Go toolchain to find the function name corresponding to the address in the symbol table.

$ readelf -h ./hello
ELF Header:
...
  Entry point address: 0x454020
...
$ go tool nm ./hello| grep 454020
  454020 T _rt0_amd64_linux

Startup Phase

In the Go source code, you can find the file where _rt0_amd64_linux is located. The work done at the start of the program is written in the assembly code corresponding to the platform, mainly in two files: runtime/rt0_linux_amd64.s and runtime/asm_amd64.s.

// _rt0_amd64_linux is the symbol at the ELF entry address found above.
// It does no work of its own: it jumps straight to _rt0_amd64.
TEXT _rt0_amd64_linux(SB),NOSPLIT,$-8
    JMP _rt0_amd64(SB)

// _rt0_amd64 picks up the process arguments from the initial stack and
// hands them to runtime·rt0_go in registers: argc in DI, argv in SI.
TEXT _rt0_amd64(SB),NOSPLIT,$-8
    MOVQ    0(SP), DI   // argc
    LEAQ    8(SP), SI   // argv (the pointer array starts right after argc)
    JMP runtime·rt0_go(SB)

// runtime·rt0_go is the bootstrap routine shown in this excerpt: it records
// g0's stack bounds, installs TLS for m0, runs sanity checks, initializes
// args / OS / scheduler state, queues runtime.main as a new goroutine via
// newproc, and finally enters the scheduler through mstart.
TEXT runtime·rt0_go(SB),NOSPLIT|NOFRAME|TOPFRAME,$0
    // ... (elided; presumably this is where argc/argv are stored at
    // 24(SP)/32(SP), since they are read back from there below)

    // Set up the stack bounds for g0: the 64 KiB below the current SP.
    // stack.lo and both stack guards are SP-64KiB; stack.hi is SP.
    MOVQ    $runtime·g0(SB), DI
    LEAQ    (-64*1024)(SP), BX
    MOVQ    BX, g_stackguard0(DI)
    MOVQ    BX, g_stackguard1(DI)
    MOVQ    BX, (g_stack+stack_lo)(DI)
    MOVQ    SP, (g_stack+stack_hi)(DI)

    // Use arch_prctl(ARCH_SET_FS) to set the TLS base address of the m0 thread to m0.tls
    LEAQ    runtime·m0+m_tls(SB), DI
    CALL    runtime·settls(SB)

    // Perform basic correctness and safety checks,
    // including: size of basic types, size of platform pointer types, CAS correctness checks
    CALL    runtime·check(SB)

    // Place argc and argv at 0(SP) and 8(SP) as the arguments of runtime·args
    MOVL    24(SP), AX        // copy argc
    MOVL    AX, 0(SP)
    MOVQ    32(SP), AX        // copy argv
    MOVQ    AX, 8(SP)
    CALL    runtime·args(SB)
    CALL    runtime·osinit(SB)
    CALL    runtime·schedinit(SB)

    // The address of the runtime.main function value is pushed as the argument of newproc.
    // This adds runtime.main to the run queue of a P.
    MOVQ    $runtime·mainPC(SB), AX        // entry
    PUSHQ    AX
    CALL    runtime·newproc(SB)
    POPQ    AX

    // The main thread enters the scheduling loop:
    // runtime.main will be scheduled for execution,
    // and runtime.main will in turn call main.main
    CALL    runtime·mstart(SB)

    // mstart should not return, so abort the program if it ever does
    CALL    runtime·abort(SB)
    RET

// runtime·mstart is a thin assembly wrapper that calls the Go function
// runtime·mstart0; control is not expected to come back here.
TEXT runtime·mstart(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
    CALL    runtime·mstart0(SB)
    RET // not reached

runtime.args

// runtime/runtime1.go
// args records the process argument count and argument vector in the
// runtime globals argc and argv, then passes both to sysargs for
// OS-specific processing.
//
// c is the argument count and v the argument vector, as placed on the stack
// by rt0_go before the call.
func args(c int32, v **byte) {
    argc = c
    argv = v
    sysargs(c, v)
}

This function mainly sets the two global variables argc and argv. sysargs then skips past argv and the environment strings to reach the ELF auxiliary vector, from which it retrieves and sets:

startupRandomData: a 16-byte buffer of random data supplied by the kernel (the AT_RANDOM auxiliary vector entry)
physPageSize: the system's physical page size
Reads the ELF header to obtain string tables, symbol tables, dynamic linking, and vdso information
On Linux, it reads the symbol table to set two special pointers for vdso calls

// vdsoSymbolKeys lists the vDSO symbols the runtime looks up and, for each,
// the variable that receives the resolved symbol (vdsoGettimeofdaySym and
// vdsoClockgettimeSym).
// NOTE(review): the two hex constants per entry are presumably precomputed
// hashes of the symbol name used for symbol-table lookup; confirm against
// the vdsoSymbolKey definition.
var vdsoSymbolKeys = []vdsoSymbolKey{
    {"__vdso_gettimeofday", 0x315ca59, 0xb01bca00, &vdsoGettimeofdaySym},
    {"__vdso_clock_gettime", 0xd35ec75, 0x6e43a318, &vdsoClockgettimeSym},
}

runtime.osinit

// runtime/os_linux.go
// osinit performs the OS-level part of runtime initialization: it records
// the processor count in ncpu and the huge page size in physHugePageSize,
// handles cgo-specific setup when iscgo is set, and finishes with the
// architecture-specific hook osArchInit.
func osinit() {
    ncpu = getproccount()
    physHugePageSize = getHugePageSize()
    if iscgo {
        // ... cgo signal-related handling (elided)
    }
    osArchInit()
}

On Linux, osinit is relatively simple:

It obtains the number of usable processors through sched_getaffinity
It obtains the size of a transparent huge page from /sys/kernel/mm/transparent_hugepage/hpage_pmd_size

runtime.schedinit

schedinit initializes the Go goroutine scheduler. Only the key parts of the code are shown here; calls to functions that are currently empty stubs are omitted.

// runtime/proc.go
// schedinit runs the one-time runtime initialization that must happen before
// any goroutine is scheduled: it initializes the subsystems listed below in
// order, then sizes the set of Ps from ncpu or the GOMAXPROCS environment
// variable through procresize.
//
// It throws if procresize reports runnable Ps, because no goroutine should
// be runnable yet at this point of the bootstrap.
func schedinit() {
    gp := getg()
    sched.maxmcount = 10000
    moduledataverify()      // verify module data
    stackinit()             // initialize the goroutine stack memory pool
    mallocinit()            // initialize the memory allocator
    alginit()               // initialize AES hardware support for the runtime's algorithms
    fastrandinit()          // seed the random generator from startupRandomData (set earlier in sysargs)
    mcommoninit(gp.m, -1)   // initialize the M: incrementing thread ID, signal-handling goroutine, fast random seed
    modulesinit()           // read each module to initialize the sizes of the global variables scanned by GC
    typelinksinit()         // read each module to collect type link information
    itabsinit()             // initialize the itab table based on the typelinks
    stkobjinit()            // initialize GC-related stack object data

    sigsave(&gp.m.sigmask)  // save the thread's signal mask

    goargs()                // set the global variable argslice
    goenvs()                // set the global variable envs
    parsedebugvars()        // read the GODEBUG environment variable to set debug/trace options
    gcinit()                // initialize the GC

    // Initialize allp from the processor count; a positive GOMAXPROCS overrides it
    procs := ncpu
    if n, ok := atoi32(gogetenv("GOMAXPROCS")); ok && n > 0 {
        procs = n
    }
    // procresize returns the Ps that have runnable goroutines; none may exist yet
    if procresize(procs) != nil {
        throw("unknown runnable goroutine during bootstrap")
    }
}

runtime.getg

The compiler lowers a call to runtime.getg into a single assembly instruction that loads the value stored in the thread's TLS slot. That slot is set to the current goroutine's data pointer *g when the goroutine is started through runtime.gogo.

// ir.OGETG -> ssa.OpGetG -> ssa.OpAMD64LoweredGetG -> MOVQ (TLS), r

// runtime·gogo is the public entry for resuming a goroutine; after the
// elided setup it tail-jumps to the file-local gogo<>.
TEXT runtime·gogo(SB), NOSPLIT, $0-8
    // ... (elided; presumably loads the target *g into DX, which gogo<> stores below)
    JMP    gogo<>(SB)

// gogo<> (excerpt) stores the value in DX into the TLS g slot, which is the
// slot that getg reads back with MOVQ (TLS), r.
TEXT gogo<>(SB), NOSPLIT, $0
    get_tls(CX)          // CX = TLS base
    MOVQ    DX, g(CX)    // TLS g slot = DX

runtime.procresize

// runtime/proc.go
// procresize changes the number of Ps to nprocs (abridged excerpt).
//
// It initializes any newly added Ps, makes sure the calling M holds a P that
// survives the resize, destroys the Ps beyond nprocs, and puts the remaining
// Ps either on the idle list or on the returned list.
//
// It returns a linked list (chained through p.link) of the Ps, other than
// the current one, whose local run queue is not empty; nil if there are none.
//
// NOTE(review): old and now are used below but assigned in the elided first
// half; presumably old is the previous P count and now a current timestamp.
func procresize(nprocs int32) *p {
    // The first half (elided) mainly modifies:
    // 1. allp []*p
    // 2. idlepMask: idle P mask, each bit indicates whether the corresponding index in allp is idle
    // 3. timerpMask: mask of Ps that may have timers, each bit indicates whether the corresponding index in allp has a timer

    // When growing the number of procs, initialize the newly created Ps.
    // At program start allp is empty, so all Ps are created and initialized here.
    for i := old; i < nprocs; i++ {
        pp := allp[i]
        if pp == nil {
            pp = new(p)
        }
        pp.init(i)
        atomicstorep(unsafe.Pointer(&allp[i]), unsafe.Pointer(pp))
    }

    // Excess Ps are destroyed further down when shrinking,
    // so first make sure the current P is not one of those to be destroyed.
    // If it is (or if there is no current P), switch to allp[0].
    gp := getg()
    if gp.m.p != 0 && gp.m.p.ptr().id < nprocs {
        // Continue using the current P
        gp.m.p.ptr().status = _Prunning
        gp.m.p.ptr().mcache.prepareForSweep()
    } else {
        // Detach from the current P, if any, before acquiring allp[0]
        if gp.m.p != 0 {
            gp.m.p.ptr().m = 0
        }
        gp.m.p = 0
        pp := allp[0]
        pp.m = 0
        pp.status = _Pidle
        acquirep(pp)
    }

    // Clean up the old Ps when the number of procs shrinks
    for i := nprocs; i < old; i++ {
        pp := allp[i]
        pp.destroy()
    }

    // Build the linked list of runnable Ps to return
    var runnablePs *p
    for i := nprocs - 1; i >= 0; i-- {
        pp := allp[i]
        // The current P stays bound to this M; skip it
        if gp.m.p.ptr() == pp {
            continue
        }
        pp.status = _Pidle
        if runqempty(pp) {
            // No runnable g on this P's local queue: hand the P to pidleput
            pidleput(pp, now)
        } else {
            // Pair the P with the M returned by mget (which may be nil)
            // and prepend the P to the runnable list
            pp.m.set(mget())
            pp.link.set(runnablePs)
            runnablePs = pp
        }
    }

    if old != nprocs {
        // The number of procs changed: update the GC CPU limiter's capacity to match
        gcCPULimiter.resetCapacity(now, nprocs)
    }
    return runnablePs
}

runtime.newproc

newproc takes a function (address), creates a new g for it, and places that g on the local run queue of the current p. Once runtime.main has started (mainStarted is set), it also calls wakep so that another p can pick up work.

During the program startup, the address of runtime.main is passed here.

// newproc creates a new goroutine that will run fn and queues it on the
// current P's local run queue.
//
// The creation and queueing are performed inside a systemstack call. wakep
// is only called once mainStarted is set, so the goroutine queued during
// bootstrap (runtime.main) does not trigger it.
func newproc(fn *funcval) {
    gp := getg()
    pc := getcallerpc() // PC of the caller, recorded as the new g's gopc by newproc1
    systemstack(func() {
        newg := newproc1(fn, gp, pc)
        pp := getg().m.p.ptr()
        runqput(pp, newg, true)
        if mainStarted {
            // mainStarted is set in runtime.main
            wakep()
        }
    })
}

// newproc1 builds a g in the _Grunnable state that will start executing fn
// (abridged excerpt).
//
// callergp is the goroutine creating the new one and callerpc the address of
// the statement that created it; both are recorded on the new g. The caller
// is responsible for putting the returned g on a run queue.
func newproc1(fn *funcval, callergp *g, callerpc uintptr) *g {
    mp := acquirem()
    pp := mp.p.ptr()
    newg := gfget(pp) // First try to reuse a free g via gfget
    if newg == nil {
        // None available: create a new g with malg.
        // Move it from _Gidle to _Gdead before publishing it in the global allg,
        // so the GC does not scan its not-yet-initialized stack.
        newg = malg(_StackMin)
        casgstatus(newg, _Gidle, _Gdead)
        allgadd(newg)
    }

    // Calculate the initial stack pointer: reserve a small aligned area at the top of the stack
    totalSize := uintptr(4*goarch.PtrSize + sys.MinFrameSize) // extra space in case of reads slightly beyond frame
    totalSize = alignUp(totalSize, sys.StackAlign)
    sp := newg.stack.hi - totalSize

    // Reset and fill in newg's scheduling data: stack pointer, program counter,
    // function address, caller information, etc.
    memclrNoHeapPointers(unsafe.Pointer(&newg.sched), unsafe.Sizeof(newg.sched))
    newg.sched.sp = sp
    newg.stktopsp = sp
    newg.sched.pc = abi.FuncPCABI0(goexit) + sys.PCQuantum // +PCQuantum so that previous instruction is in same function
    newg.sched.g = guintptr(unsafe.Pointer(newg))
    gostartcallfn(&newg.sched, fn)
    newg.parentGoid = callergp.goid
    newg.gopc = callerpc
    newg.ancestors = saveAncestors(callergp)
    newg.startpc = fn.fn

    // Change the status to runnable and account for the stack in the GC's scannable stack size
    casgstatus(newg, _Gdead, _Grunnable)
    gcController.addScannableStack(pp, int64(newg.stack.hi-newg.stack.lo))

    // Assign the goroutine ID from the P-local goid cache
    newg.goid = pp.goidcache
    pp.goidcache++

    releasem(mp)

    return newg
}

runtime.mstart0

// mstart0 is the Go entry point of a newly started M, called from the
// assembly stub runtime·mstart (abridged excerpt).
//
// It sets the stack guards of the current g and runs mstart1. mexit is
// reached only when control later returns from mstart1, which per the
// comment in mstart1 happens when the thread is to exit.
//
// NOTE(review): osStack is assigned in code elided from this excerpt.
func mstart0() {
    gp := getg()

    // Initialize g0's stack guards, used for stack overflow and stack growth checks
    gp.stackguard0 = gp.stack.lo + _StackGuard
    gp.stackguard1 = gp.stackguard0
    mstart1()

    mexit(osStack)
}

// mstart1 finishes setting up the current M and enters the scheduler
// (abridged excerpt).
//
// It saves a resume point in g0.sched, runs per-thread initialization
// (plus mstartm0 on m0), invokes the M's optional mstartfn, binds the P
// stored in nextp for every M other than m0, and calls schedule.
func mstart1() {
    gp := getg()

    // Set up m.g0.sched as a label returning to just
    // after the mstart1 call in mstart0 above, for use by goexit0 and mcall.
    // We're never coming back to mstart1 after we call schedule,
    // so other calls can reuse the current frame.
    // And goexit0 does a gogo that needs to return from mstart1
    // and let mstart0 exit the thread.
    gp.sched.g = guintptr(unsafe.Pointer(gp))
    gp.sched.pc = getcallerpc()
    gp.sched.sp = getcallersp()

    // Initialize the thread's signal-handling goroutine stack and signal mask
    minit()
    if gp.m == &m0 {
        // On m0 only: install the signal handling function sighandler
        mstartm0()
    }

    // Some internal threads, such as sysmon, run their start function directly here
    if fn := gp.m.mstartfn; fn != nil {
        fn()
    }

    // Every M except m0 binds the P that was handed to it in nextp
    if gp.m != &m0 {
        acquirep(gp.m.nextp.ptr())
        gp.m.nextp = 0
    }

    // Execute the scheduling loop; it never returns.
    // During bootstrap there is only one runnable g at this point,
    // so the scheduler will jump to runtime.main.
    schedule()
}

// main is runtime.main, the body of the goroutine queued by rt0_go through
// newproc (abridged excerpt); it is distinct from the user's main.main,
// which it invokes as main_main.
//
// It sets the maximum stack size, marks the main goroutine as started, runs
// the runtime's and then the user program's init tasks (enabling the GC in
// between), calls main.main, waits briefly for in-flight panic handling in
// other goroutines, runs the exit hooks and exits the process with status 0.
func main() {
    mp := getg().m

    // Set the maximum stack size: 1 GB on 64-bit platforms, 250 MB on 32-bit ones
    if goarch.PtrSize == 8 {
        maxstacksize = 1000000000
    } else {
        maxstacksize = 250000000
    }

    // From now on, goroutines created by newproc trigger wakep,
    // which can find or create a thread to execute them
    mainStarted = true

    // Execute the init functions of the runtime package and initialize its global variables
    doInit(&runtime_inittask)

    // Enable GC
    gcenable()

    // Execute the user-level init functions and initialize global variables
    doInit(&main_inittask)

    // With -buildmode=c-archive or c-shared, main.main is not executed
    if isarchive || islibrary {
        return
    }

    // Execute the user-level main.main function
    fn := main_main
    fn()

    // Before the main goroutine exits, if other goroutines are currently running panic defers,
    // give them a chance to finish first (for example to print the panic information).
    // The wait is bounded: at most 1000 Gosched yields.
    if runningPanicDefers.Load() != 0 {
        for c := 0; c < 1000; c++ {
            if runningPanicDefers.Load() == 0 {
                break
            }
            Gosched()
        }
    }

    // Execute the registered hooks before the process exits,
    // for example writing code coverage data when built with -cover
    runExitHooks(0)
    exit(0)
}