1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
10 TEXT runtime·rt0_go(SB),NOSPLIT,$0 // process bootstrap: prepares g0, then calls check, args, osinit, schedinit, newproc and mstart
11 // copy arguments forward on an even stack
14 SUBQ $(4*8+7), SP // 2args 2auto
19 // create istack out of the given (operating system) stack.
20 // _cgo_init may update stackguard.
21 MOVQ $runtime·g0(SB), DI // DI = &g0
22 LEAQ (-64*1024+104)(SP), BX // BX = SP - 64KiB + 104: provisional low bound of g0's stack
23 MOVQ BX, g_stackguard0(DI) // g0.stackguard0 = BX
24 MOVQ BX, g_stackguard1(DI) // g0.stackguard1 = BX
25 MOVQ BX, (g_stack+stack_lo)(DI) // g0.stack.lo = BX
26 MOVQ SP, (g_stack+stack_hi)(DI) // g0.stack.hi = current SP
28 // find out information about the processor we're on
34 // Figure out how to serialize RDTSC.
35 // On Intel processors LFENCE is enough. AMD requires MFENCE.
36 // Don't know about the rest, so let's do MFENCE.
37 CMPL BX, $0x756E6547 // "Genu" (NOTE(review): assumes BX/DX/CX hold the CPUID vendor string here - confirm)
39 CMPL DX, $0x49656E69 // "ineI"
41 CMPL CX, $0x6C65746E // "ntel"
43 MOVB $1, runtime·lfenceBeforeRdtsc(SB) // record that LFENCE is sufficient; cputicks tests this flag
48 MOVL CX, runtime·cpuid_ecx(SB) // keep CX in a runtime global (presumably CPUID feature bits - confirm)
49 MOVL DX, runtime·cpuid_edx(SB) // keep DX in a runtime global
52 // if there is a _cgo_init, call it.
53 MOVQ _cgo_init(SB), AX // AX = value stored in _cgo_init
57 MOVQ DI, CX // Win64 uses CX for first parameter
58 MOVQ $setg_gcc<>(SB), SI // SI = &setg_gcc<>, in the System V second-argument register
61 // update stackguard after _cgo_init
62 MOVQ $runtime·g0(SB), CX // CX = &g0
63 MOVQ (g_stack+stack_lo)(CX), AX // AX = g0.stack.lo
64 ADDQ $const__StackGuard, AX // AX = g0.stack.lo + _StackGuard
65 MOVQ AX, g_stackguard0(CX) // both guards are set to that value
66 MOVQ AX, g_stackguard1(CX)
68 CMPL runtime·iswindows(SB), $0
71 // skip TLS setup on Plan 9
72 CMPL runtime·isplan9(SB), $1
74 // skip TLS setup on Solaris
75 CMPL runtime·issolaris(SB), $1
78 LEAQ runtime·tls0(SB), DI // DI = &tls0 for settls
79 CALL runtime·settls(SB)
81 // store through it, to make sure it works
84 MOVQ runtime·tls0(SB), AX // read tls0 back for the check
89 // set the per-goroutine and per-mach "registers"
91 LEAQ runtime·g0(SB), CX // CX = &g0
93 LEAQ runtime·m0(SB), AX // AX = &m0
100 CLD // convention is D is always left cleared
101 CALL runtime·check(SB)
103 MOVL 16(SP), AX // copy argc (32-bit load from 16(SP))
105 MOVQ 24(SP), AX // copy argv (64-bit load from 24(SP))
107 CALL runtime·args(SB)
108 CALL runtime·osinit(SB)
109 CALL runtime·schedinit(SB)
111 // create a new goroutine to start program
112 MOVQ $runtime·main·f(SB), AX // entry: AX = &runtime·main·f
115 CALL runtime·newproc(SB)
120 CALL runtime·mstart(SB) // not expected to return; the next instruction crashes if it does
122 MOVL $0xf1, 0xf1 // crash: store to absolute address 0xf1
125 DATA runtime·main·f+0(SB)/8,$runtime·main(SB) // one 8-byte word holding the address of runtime·main
126 GLOBL runtime·main·f(SB),RODATA,$8 // read-only, 8 bytes; rt0_go loads its address as the new goroutine's entry
128 TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
132 TEXT runtime·asminit(SB),NOSPLIT,$0-0
133 // No per-thread init.
140 // void gosave(Gobuf*)
141 // save state in Gobuf; setjmp.
142 TEXT runtime·gosave(SB), NOSPLIT, $0-8 // records the caller's SP, PC and BP; zeroes ret and ctxt
143 MOVQ buf+0(FP), AX // gobuf
144 LEAQ buf+0(FP), BX // caller's SP (address of the first argument slot)
145 MOVQ BX, gobuf_sp(AX) // buf.sp = caller's SP
146 MOVQ 0(SP), BX // caller's PC (return address at the top of the stack)
147 MOVQ BX, gobuf_pc(AX) // buf.pc = caller's PC
148 MOVQ $0, gobuf_ret(AX) // buf.ret = 0
149 MOVQ $0, gobuf_ctxt(AX) // buf.ctxt = 0
150 MOVQ BP, gobuf_bp(AX) // buf.bp = BP
157 // restore state from Gobuf; longjmp. Loads SP, AX (ret), DX (ctxt) and BP from the Gobuf, then clears those fields.
158 TEXT runtime·gogo(SB), NOSPLIT, $0-8
159 MOVQ buf+0(FP), BX // gobuf
161 MOVQ 0(DX), CX // make sure g != nil: the load faults on a nil DX (NOTE(review): DX presumably holds gobuf.g - confirm)
164 MOVQ gobuf_sp(BX), SP // restore SP
165 MOVQ gobuf_ret(BX), AX // AX = buf.ret
166 MOVQ gobuf_ctxt(BX), DX // DX = buf.ctxt
167 MOVQ gobuf_bp(BX), BP // BP = buf.bp
168 MOVQ $0, gobuf_sp(BX) // clear to help garbage collector
169 MOVQ $0, gobuf_ret(BX)
170 MOVQ $0, gobuf_ctxt(BX)
171 MOVQ $0, gobuf_bp(BX)
172 MOVQ gobuf_pc(BX), BX // BX = buf.pc, replacing the Gobuf pointer
175 // func mcall(fn func(*g))
176 // Switch to m->g0's stack, call fn(g).
177 // Fn must never return. It should gogo(&g->sched)
178 // to keep running g.
179 TEXT runtime·mcall(SB), NOSPLIT, $0-8
183 MOVQ g(CX), AX // save state in g->sched (AX = g)
184 MOVQ 0(SP), BX // caller's PC
185 MOVQ BX, (g_sched+gobuf_pc)(AX) // g->sched.pc = caller's PC
186 LEAQ fn+0(FP), BX // caller's SP
187 MOVQ BX, (g_sched+gobuf_sp)(AX) // g->sched.sp = caller's SP
188 MOVQ AX, (g_sched+gobuf_g)(AX) // g->sched.g = g
189 MOVQ BP, (g_sched+gobuf_bp)(AX) // g->sched.bp = BP
191 // switch to m->g0 & its stack, call fn
195 CMPQ SI, AX // if g == m->g0 call badmcall
197 MOVQ $runtime·badmcall(SB), AX // AX = &badmcall for the error path
199 MOVQ SI, g(CX) // g = m->g0
200 MOVQ (g_sched+gobuf_sp)(SI), SP // sp = m->g0->sched.sp
206 MOVQ $runtime·badmcall2(SB), AX // AX = &badmcall2 (NOTE(review): presumably reached only if fn returns - confirm)
210 // systemstack_switch is a dummy routine that systemstack leaves at the bottom
211 // of the G stack. We need to distinguish the routine that
212 // lives at the bottom of the G stack from the one that lives
213 // at the top of the system stack because the one at the top of
214 // the system stack terminates the stack walk (see topofstack()).
215 TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
218 // func systemstack(fn func()): call fn on the g0 stack, or directly when already on an m stack.
219 TEXT runtime·systemstack(SB), NOSPLIT, $0-8
220 MOVQ fn+0(FP), DI // DI = fn
222 MOVQ g(CX), AX // AX = g
223 MOVQ g_m(AX), BX // BX = m
225 MOVQ m_gsignal(BX), DX // DX = gsignal
229 MOVQ m_g0(BX), DX // DX = g0
237 // Bad: g is not gsignal, not g0, not curg. What is it?
238 MOVQ $runtime·badsystemstack(SB), AX // AX = &badsystemstack for the error path
242 // save our state in g->sched. Pretend to
243 // be systemstack_switch if the G stack is scanned.
244 MOVQ $runtime·systemstack_switch(SB), SI // SI = &systemstack_switch, the PC we pretend to be at
245 MOVQ SI, (g_sched+gobuf_pc)(AX) // g->sched.pc = systemstack_switch
246 MOVQ SP, (g_sched+gobuf_sp)(AX) // g->sched.sp = SP
247 MOVQ AX, (g_sched+gobuf_g)(AX) // g->sched.g = g
248 MOVQ BP, (g_sched+gobuf_bp)(AX) // g->sched.bp = BP
252 MOVQ (g_sched+gobuf_sp)(DX), BX // BX = sched.sp of the g in DX (g0, per the load above)
253 // make it look like mstart called systemstack on g0, to stop traceback
255 MOVQ $runtime·mstart(SB), DX // DX = &mstart, the pretended caller
259 // call target function
270 MOVQ (g_sched+gobuf_sp)(AX), SP // SP = sched.sp of the g in AX
271 MOVQ $0, (g_sched+gobuf_sp)(AX) // clear the saved sp once it has been restored
275 // already on m stack, just call directly
282 * support for morestack
285 // Called during function prolog when more stack is needed.
287 // The traceback routines see morestack on a g0 as being
288 // the top of a stack (for example, morestack calling newstack
289 // calling the scheduler calling newm calling gc), so we must
290 // record an argument size. For that purpose, it has no arguments.
291 TEXT runtime·morestack(SB),NOSPLIT,$0-0
292 // Cannot grow scheduler stack (m->g0).
301 // Cannot grow signal stack (m->gsignal).
302 MOVQ m_gsignal(BX), SI // SI = m->gsignal (BX = m)
308 // Set m->morebuf to f's caller.
309 MOVQ 8(SP), AX // f's caller's PC
310 MOVQ AX, (m_morebuf+gobuf_pc)(BX) // m->morebuf.pc = f's caller's PC
311 LEAQ 16(SP), AX // f's caller's SP
312 MOVQ AX, (m_morebuf+gobuf_sp)(BX) // m->morebuf.sp = f's caller's SP
315 MOVQ SI, (m_morebuf+gobuf_g)(BX) // m->morebuf.g = SI (NOTE(review): SI presumably holds the current g by now - confirm)
317 // Set g->sched to context in f.
318 MOVQ 0(SP), AX // f's PC
319 MOVQ AX, (g_sched+gobuf_pc)(SI) // g->sched.pc = f's PC
320 MOVQ SI, (g_sched+gobuf_g)(SI) // g->sched.g = g
321 LEAQ 8(SP), AX // f's SP
322 MOVQ AX, (g_sched+gobuf_sp)(SI) // g->sched.sp = f's SP
323 MOVQ DX, (g_sched+gobuf_ctxt)(SI) // g->sched.ctxt = DX
324 MOVQ BP, (g_sched+gobuf_bp)(SI) // g->sched.bp = BP
326 // Call newstack on m->g0's stack.
329 MOVQ (g_sched+gobuf_sp)(BX), SP // SP = sched.sp of the g in BX
330 CALL runtime·newstack(SB)
331 MOVQ $0, 0x1003 // crash if newstack returns (store to absolute address 0x1003)
334 // morestack but not preserving ctxt.
335 TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
337 JMP runtime·morestack(SB) // tail jump: no new return address is pushed
339 // reflectcall: call a function with the given argument list
340 // func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
341 // we don't have variable-sized frames, so we use a small number
342 // of constant-sized-frame functions to encode a few bits of size in the pc.
343 // Caution: ugly multiline assembly macros in your future!
345 #define DISPATCH(NAME,MAXSIZE) \
348 MOVQ $NAME(SB), AX; \
350 // Note: can't just "JMP NAME(SB)" - bad inlining results.
352 TEXT reflect·call(SB), NOSPLIT, $0-0
355 TEXT ·reflectcall(SB), NOSPLIT, $0-32 // selects a fixed-frame callNNN helper from argsize
356 MOVLQZX argsize+24(FP), CX // CX = argsize, zero-extended from 32 bits
357 // NOTE(rsc): No call16, because CALLFN needs four words
358 // of argument space to invoke callwritebarrier.
359 DISPATCH(runtime·call32, 32) // size classes double from 32 bytes...
360 DISPATCH(runtime·call64, 64)
361 DISPATCH(runtime·call128, 128)
362 DISPATCH(runtime·call256, 256)
363 DISPATCH(runtime·call512, 512)
364 DISPATCH(runtime·call1024, 1024)
365 DISPATCH(runtime·call2048, 2048)
366 DISPATCH(runtime·call4096, 4096)
367 DISPATCH(runtime·call8192, 8192)
368 DISPATCH(runtime·call16384, 16384)
369 DISPATCH(runtime·call32768, 32768)
370 DISPATCH(runtime·call65536, 65536)
371 DISPATCH(runtime·call131072, 131072)
372 DISPATCH(runtime·call262144, 262144)
373 DISPATCH(runtime·call524288, 524288)
374 DISPATCH(runtime·call1048576, 1048576)
375 DISPATCH(runtime·call2097152, 2097152)
376 DISPATCH(runtime·call4194304, 4194304)
377 DISPATCH(runtime·call8388608, 8388608)
378 DISPATCH(runtime·call16777216, 16777216)
379 DISPATCH(runtime·call33554432, 33554432)
380 DISPATCH(runtime·call67108864, 67108864)
381 DISPATCH(runtime·call134217728, 134217728)
382 DISPATCH(runtime·call268435456, 268435456)
383 DISPATCH(runtime·call536870912, 536870912)
384 DISPATCH(runtime·call1073741824, 1073741824) // ...up to 1 GiB (2^30)
385 MOVQ $runtime·badreflectcall(SB), AX // AX = &badreflectcall (NOTE(review): presumably used when no size class fits - confirm)
388 #define CALLFN(NAME,MAXSIZE) \
389 TEXT NAME(SB), WRAPPER, $MAXSIZE-32; \
391 /* copy arguments to stack: SI = argptr, CX = argsize */ \
392 MOVQ argptr+16(FP), SI; \
393 MOVLQZX argsize+24(FP), CX; \
396 /* call function */ \
398 PCDATA $PCDATA_StackMapIndex, $0; \
400 /* copy return values back: DI = argptr, CX = argsize, BX = retoffset */ \
401 MOVQ argptr+16(FP), DI; \
402 MOVLQZX argsize+24(FP), CX; \
403 MOVLQZX retoffset+28(FP), BX; \
409 /* execute write barrier updates: DX = argtype, DI = argptr, CX = argsize, BX = retoffset */ \
410 MOVQ argtype+0(FP), DX; \
411 MOVQ argptr+16(FP), DI; \
412 MOVLQZX argsize+24(FP), CX; \
413 MOVLQZX retoffset+28(FP), BX; \
418 CALL runtime·callwritebarrier(SB); \
423 CALLFN(·call128, 128)
424 CALLFN(·call256, 256)
425 CALLFN(·call512, 512)
426 CALLFN(·call1024, 1024)
427 CALLFN(·call2048, 2048)
428 CALLFN(·call4096, 4096)
429 CALLFN(·call8192, 8192)
430 CALLFN(·call16384, 16384)
431 CALLFN(·call32768, 32768)
432 CALLFN(·call65536, 65536)
433 CALLFN(·call131072, 131072)
434 CALLFN(·call262144, 262144)
435 CALLFN(·call524288, 524288)
436 CALLFN(·call1048576, 1048576)
437 CALLFN(·call2097152, 2097152)
438 CALLFN(·call4194304, 4194304)
439 CALLFN(·call8388608, 8388608)
440 CALLFN(·call16777216, 16777216)
441 CALLFN(·call33554432, 33554432)
442 CALLFN(·call67108864, 67108864)
443 CALLFN(·call134217728, 134217728)
444 CALLFN(·call268435456, 268435456)
445 CALLFN(·call536870912, 536870912)
446 CALLFN(·call1073741824, 1073741824)
448 // bool cas(int32 *val, int32 old, int32 new)
455 TEXT runtime·cas(SB), NOSPLIT, $0-17
464 // bool runtime·cas64(uint64 *val, uint64 old, uint64 new)
472 TEXT runtime·cas64(SB), NOSPLIT, $0-25
481 TEXT runtime·casuintptr(SB), NOSPLIT, $0-25 // same $0-25 layout as runtime·cas64
482 JMP runtime·cas64(SB) // tail jump: cas64 operates on this function's arguments
484 TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $0-16
485 JMP runtime·atomicload64(SB) // tail jump to the 64-bit load
487 TEXT runtime·atomicloaduint(SB), NOSPLIT, $0-16
488 JMP runtime·atomicload64(SB) // tail jump to the 64-bit load
490 TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-16 // same $0-16 layout as runtime·atomicstore64
491 JMP runtime·atomicstore64(SB) // tail jump to the 64-bit store
493 // bool casp(void **val, void *old, void *new)
500 TEXT runtime·casp1(SB), NOSPLIT, $0-25
509 // uint32 xadd(uint32 volatile *val, int32 delta)
513 TEXT runtime·xadd(SB), NOSPLIT, $0-20
523 TEXT runtime·xadd64(SB), NOSPLIT, $0-24
533 TEXT runtime·xchg(SB), NOSPLIT, $0-20
540 TEXT runtime·xchg64(SB), NOSPLIT, $0-24
547 TEXT runtime·xchgp1(SB), NOSPLIT, $0-24
554 TEXT runtime·xchguintptr(SB), NOSPLIT, $0-24 // same $0-24 layout as runtime·xchg64
555 JMP runtime·xchg64(SB) // tail jump to the 64-bit exchange
557 TEXT runtime·procyield(SB),NOSPLIT,$0-0 // NOTE(review): declares 0 bytes of arguments but the body reads cycles+0(FP) - confirm whether $0-4 is intended
558 MOVL cycles+0(FP), AX // AX = cycles (32-bit argument)
565 TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-16
571 TEXT runtime·atomicstore(SB), NOSPLIT, $0-12
577 TEXT runtime·atomicstore64(SB), NOSPLIT, $0-16
583 // void runtime·atomicor8(byte volatile*, byte);
584 TEXT runtime·atomicor8(SB), NOSPLIT, $0-9
591 // void jmpdefer(fn, sp);
592 // called from deferreturn.
594 // 2. sub 5 bytes from the caller's return
595 // 3. jmp to the argument
596 TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16
597 MOVQ fv+0(FP), DX // fn
598 MOVQ argp+8(FP), BX // caller sp
599 LEAQ -8(BX), SP // caller sp after CALL
600 SUBQ $5, (SP) // return to CALL again: back the return PC up by 5 bytes (presumably the length of the CALL instruction - confirm)
602 JMP BX // but first run the deferred function
604 // Save state of caller into g->sched. Smashes R8, R9.
605 TEXT gosave<>(SB),NOSPLIT,$0
609 MOVQ R9, (g_sched+gobuf_pc)(R8) // g->sched.pc = R9 (R8 = g)
611 MOVQ R9, (g_sched+gobuf_sp)(R8) // g->sched.sp = R9 (R9 is a scratch register reused for each field)
612 MOVQ $0, (g_sched+gobuf_ret)(R8) // g->sched.ret = 0
613 MOVQ $0, (g_sched+gobuf_ctxt)(R8) // g->sched.ctxt = 0
614 MOVQ BP, (g_sched+gobuf_bp)(R8) // g->sched.bp = BP
617 // asmcgocall(void(*fn)(void*), void *arg)
618 // Call fn(arg) on the scheduler stack,
619 // aligned appropriately for the gcc ABI.
620 // See cgocall.c for more details.
621 TEXT ·asmcgocall(SB),NOSPLIT,$0-16
624 CALL asmcgocall<>(SB) // shared implementation: expects fn in AX and arg in BX
627 TEXT ·asmcgocall_errno(SB),NOSPLIT,$0-20 // 4 more argument/result bytes than asmcgocall ($0-20 vs $0-16)
630 CALL asmcgocall<>(SB) // shared implementation: returns errno in AX
634 // asmcgocall common code. fn in AX, arg in BX. returns errno in AX.
635 TEXT asmcgocall<>(SB),NOSPLIT,$0-0
638 // Figure out if we need to switch to m->g0 stack.
639 // We get called to create new OS threads too, and those
640 // come in on the m->g0 stack already.
648 MOVQ m_gsignal(R8), SI // SI = m->gsignal (R8 = m)
655 MOVQ (g_sched+gobuf_sp)(SI), SP // SP = sched.sp of the g in SI
658 // Now on a scheduling stack (a pthread-created stack).
659 // Make sure we have enough room for 4 stack-backed fast-call
660 // registers as per windows amd64 calling convention.
662 ANDQ $~15, SP // alignment for gcc ABI: round SP down to a multiple of 16
663 MOVQ DI, 48(SP) // save g
664 MOVQ (g_stack+stack_hi)(DI), DI // DI = g->stack.hi
666 MOVQ DI, 40(SP) // save depth in stack (can't just save SP, as stack might be copied during a callback)
667 MOVQ BX, DI // DI = first argument in AMD64 ABI
668 MOVQ BX, CX // CX = first argument in Win64
671 // Restore registers, g, stack pointer.
674 MOVQ (g_stack+stack_hi)(DI), SI // SI = stack.hi of the g in DI
680 // cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
681 // Turn the fn into a Go func (by taking its address) and call
682 // cgocallback_gofunc.
683 TEXT runtime·cgocallback(SB),NOSPLIT,$24-24 // 24-byte frame, matching cgocallback_gofunc's 24 bytes of arguments
688 MOVQ framesize+16(FP), AX // AX = framesize
690 MOVQ $runtime·cgocallback_gofunc(SB), AX // AX = &cgocallback_gofunc
694 // cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
695 // See cgocall.c for more details.
696 TEXT ·cgocallback_gofunc(SB),NOSPLIT,$8-24
699 // If g is nil, Go did not create the current thread.
700 // Call needm to obtain one m for temporary use.
701 // In this case, we're running on the thread stack, so there's
702 // lots of space, but the linker doesn't know. Hide the call from
703 // the linker analysis by using an indirect call through AX.
714 MOVQ BX, R8 // holds oldm until end of function
718 MOVQ $runtime·needm(SB), AX // AX = &needm, for the indirect call described above
725 // Set m->sched.sp = SP, so that if a panic happens
726 // during the function we are about to execute, it will
727 // have a valid SP to run on the g0 stack.
728 // The next few lines (after the havem label)
729 // will save this SP onto the stack and then write
730 // the same SP back to m->sched.sp. That seems redundant,
731 // but if an unrecovered panic happens, unwindm will
732 // restore the g->sched.sp from the stack location
733 // and then systemstack will try to use it. If we don't set it here,
734 // that restored SP will be uninitialized (typically 0) and
735 // will not be usable.
737 MOVQ SP, (g_sched+gobuf_sp)(SI) // sched.sp of the g in SI = SP
740 // Now there's a valid m, and we're running on its m->g0.
741 // Save current m->g0->sched.sp on stack and then set it to SP.
742 // Save current sp in m->g0->sched.sp in preparation for
743 // switch back to m->curg stack.
744 // NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
746 MOVQ (g_sched+gobuf_sp)(SI), AX // AX = previous m->g0->sched.sp
748 MOVQ SP, (g_sched+gobuf_sp)(SI) // m->g0->sched.sp = SP
750 // Switch to m->curg stack and call runtime.cgocallbackg.
751 // Because we are taking over the execution of m->curg
752 // but *not* resuming what had been running, we need to
753 // save that information (m->curg->sched) so we can restore it.
754 // We can restore m->curg->sched.sp easily, because calling
755 // runtime.cgocallbackg leaves SP unchanged upon return.
756 // To save m->curg->sched.pc, we push it onto the stack.
757 // This has the added benefit that it looks to the traceback
758 // routine like cgocallbackg is going to return to that
759 // PC (because the frame we allocate below has the same
760 // size as cgocallback_gofunc's frame declared above)
761 // so that the traceback will seamlessly trace back into
762 // the earlier calls.
764 // In the new goroutine, 0(SP) holds the saved R8.
767 MOVQ (g_sched+gobuf_sp)(SI), DI // prepare stack as DI
768 MOVQ (g_sched+gobuf_pc)(SI), BX // BX = sched.pc to be saved
770 // Compute the size of the frame, including return PC and, if
771 // GOEXPERIMENT=framepointer, the saved base pointer
778 CALL runtime·cgocallbackg(SB)
781 // Compute the size of the frame again. FP and SP have
782 // completely different values here than they did above,
783 // but only their difference matters.
787 // Restore g->sched (== m->curg->sched) from saved values.
793 MOVQ BX, (g_sched+gobuf_pc)(SI) // g->sched.pc = BX
794 MOVQ DI, (g_sched+gobuf_sp)(SI) // g->sched.sp = DI
796 // Switch back to m->g0's stack and restore m->g0->sched.sp.
797 // (Unlike m->curg, the g0 goroutine never uses sched.pc,
798 // so we do not have to restore it.)
803 MOVQ (g_sched+gobuf_sp)(SI), SP // SP = m->g0->sched.sp
805 MOVQ AX, (g_sched+gobuf_sp)(SI) // m->g0->sched.sp = AX
807 // If the m on entry was nil, we called needm above to borrow an m
808 // for the duration of the call. Since the call is over, return it with dropm.
811 MOVQ $runtime·dropm(SB), AX // AX = &dropm
817 // void setg(G*); set g. for use by needm.
818 TEXT runtime·setg(SB), NOSPLIT, $0-8
834 // void setg_gcc(G*); set g called from gcc.
835 TEXT setg_gcc<>(SB),NOSPLIT,$0
840 // check that SP is in range [g->stack.lo, g->stack.hi)
841 TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
844 CMPQ (g_stack+stack_hi)(AX), SP // upper bound: g->stack.hi against SP (AX = g)
847 CMPQ SP, (g_stack+stack_lo)(AX) // lower bound: SP against g->stack.lo
852 TEXT runtime·getcallerpc(SB),NOSPLIT,$0-16 // loads the return PC stored 8 bytes below the address in argp
853 MOVQ argp+0(FP),AX // addr of first arg
854 MOVQ -8(AX),AX // get calling pc: the word just below the first argument
858 TEXT runtime·gogetcallerpc(SB),NOSPLIT,$0-16 // same loads as runtime·getcallerpc, with the argument named p
859 MOVQ p+0(FP),AX // addr of first arg
860 MOVQ -8(AX),AX // get calling pc: the word just below the first argument
864 TEXT runtime·setcallerpc(SB),NOSPLIT,$0-16 // overwrites the return PC stored 8 bytes below the address in argp
865 MOVQ argp+0(FP),AX // addr of first arg
867 MOVQ BX, -8(AX) // set calling pc (BX = the new PC)
870 TEXT runtime·getcallersp(SB),NOSPLIT,$0-16
875 // func gogetcallersp(p unsafe.Pointer) uintptr
876 TEXT runtime·gogetcallersp(SB),NOSPLIT,$0-16
877 MOVQ p+0(FP),AX // addr of first arg
881 // func cputicks() int64
882 TEXT runtime·cputicks(SB),NOSPLIT,$0-0
883 CMPB runtime·lfenceBeforeRdtsc(SB), $1 // flag written by rt0_go; 1 means LFENCE is sufficient
885 BYTE $0x0f; BYTE $0xae; BYTE $0xe8 // LFENCE (emitted as raw opcode bytes)
888 BYTE $0x0f; BYTE $0xae; BYTE $0xf0 // MFENCE (emitted as raw opcode bytes)
896 // memhash_varlen(p unsafe.Pointer, h seed) uintptr
897 // redirects to memhash(p, h, size) using the size
898 // stored in the closure.
899 TEXT runtime·memhash_varlen(SB),NOSPLIT,$32-24 // 32-byte local frame for the outgoing memhash call
908 CALL runtime·memhash(SB) // memhash(p, h, size)
913 // hash function using AES hardware instructions
914 TEXT runtime·aeshash(SB),NOSPLIT,$0-32
915 MOVQ p+0(FP), AX // ptr to data
916 MOVQ s+16(FP), CX // size
918 JMP runtime·aeshashbody(SB) // tail jump: aeshashbody takes the pointer in AX and the size in CX
920 TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24 // string variant: unpacks the string header, then shares aeshashbody
921 MOVQ p+0(FP), AX // ptr to string struct
922 MOVQ 8(AX), CX // length of string (second word of the header)
923 MOVQ (AX), AX // string data (first word of the header)
925 JMP runtime·aeshashbody(SB) // tail jump: pointer in AX, size in CX
929 // DX: address to put return value
930 TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0 // entered by JMP from aeshash/aeshashstr with AX = data pointer, CX = size
931 MOVQ h+8(FP), X6 // seed to low 64 bits of xmm6
932 PINSRQ $1, CX, X6 // size to high 64 bits of xmm6
933 PSHUFHW $0, X6, X6 // replace size with its low 2 bytes repeated 4 times
934 MOVO runtime·aeskeysched(SB), X7 // X7 = first 16 bytes of runtime·aeskeysched
954 // 16 bytes loaded at this address won't cross
955 // a page boundary, so we can load it directly.
958 MOVQ $masks<>(SB), AX // AX = &masks
969 // address ends in 1111xxxx. Might be up against
970 // a page boundary, so load ending at last byte.
971 // Then shift bytes down using pshufb.
972 MOVOU -32(AX)(CX*1), X0
974 MOVQ $shifts<>(SB), AX // AX = &shifts
975 PSHUFB (AX)(CX*8), X0 // shuffle X0 with the shifts entry selected by CX
997 // load data to be hashed
999 MOVOU -16(AX)(CX*1), X1 // X1 = the 16 bytes ending at AX+CX
1003 AESENC runtime·aeskeysched+16(SB), X1
1017 MOVOU -32(AX)(CX*1), X2 // X2 = 16 bytes starting at AX+CX-32
1018 MOVOU -16(AX)(CX*1), X3 // X3 = the 16 bytes ending at AX+CX
1021 AESENC runtime·aeskeysched+16(SB), X1 // each register is scrambled with its own 16-byte slice of aeskeysched
1022 AESENC runtime·aeskeysched+32(SB), X2
1023 AESENC runtime·aeskeysched+48(SB), X3
1044 MOVOU -64(AX)(CX*1), X4 // X4, X5, X8, X9 = the 64 bytes ending at AX+CX
1045 MOVOU -48(AX)(CX*1), X5
1046 MOVOU -32(AX)(CX*1), X8
1047 MOVOU -16(AX)(CX*1), X9
1050 AESENC runtime·aeskeysched+16(SB), X1
1051 AESENC runtime·aeskeysched+32(SB), X2
1052 AESENC runtime·aeskeysched+48(SB), X3
1053 AESENC runtime·aeskeysched+64(SB), X4
1054 AESENC runtime·aeskeysched+80(SB), X5
1055 AESENC runtime·aeskeysched+96(SB), X8
1056 AESENC runtime·aeskeysched+112(SB), X9
1085 // start with last (possibly overlapping) block
1086 MOVOU -128(AX)(CX*1), X0 // the eight loads cover the 128 bytes ending at AX+CX
1087 MOVOU -112(AX)(CX*1), X1
1088 MOVOU -96(AX)(CX*1), X2
1089 MOVOU -80(AX)(CX*1), X3
1090 MOVOU -64(AX)(CX*1), X4
1091 MOVOU -48(AX)(CX*1), X5
1092 MOVOU -32(AX)(CX*1), X8
1093 MOVOU -16(AX)(CX*1), X9
1095 // scramble state once
1097 AESENC runtime·aeskeysched+16(SB), X1
1098 AESENC runtime·aeskeysched+32(SB), X2
1099 AESENC runtime·aeskeysched+48(SB), X3
1100 AESENC runtime·aeskeysched+64(SB), X4
1101 AESENC runtime·aeskeysched+80(SB), X5
1102 AESENC runtime·aeskeysched+96(SB), X8
1103 AESENC runtime·aeskeysched+112(SB), X9
1105 // compute number of remaining 128-byte blocks
1110 // scramble state, xor in a block
1142 // 2 more scrambles to finish
1170 TEXT runtime·aeshash32(SB),NOSPLIT,$0-24 // hashes the 4 bytes at p, seeded with h
1171 MOVQ p+0(FP), AX // ptr to data
1172 MOVQ h+8(FP), X0 // seed (low 64 bits of X0)
1173 PINSRD $2, (AX), X0 // data: 4 bytes from (AX) into dword lane 2 of X0
1174 AESENC runtime·aeskeysched+0(SB), X0 // three AESENC rounds, each with the next 16-byte slice of aeskeysched
1175 AESENC runtime·aeskeysched+16(SB), X0
1176 AESENC runtime·aeskeysched+32(SB), X0
1180 TEXT runtime·aeshash64(SB),NOSPLIT,$0-24 // hashes the 8 bytes at p, seeded with h
1181 MOVQ p+0(FP), AX // ptr to data
1182 MOVQ h+8(FP), X0 // seed (low 64 bits of X0)
1183 PINSRQ $1, (AX), X0 // data: 8 bytes from (AX) into the high 64 bits of X0
1184 AESENC runtime·aeskeysched+0(SB), X0 // three AESENC rounds, each with the next 16-byte slice of aeskeysched
1185 AESENC runtime·aeskeysched+16(SB), X0
1186 AESENC runtime·aeskeysched+32(SB), X0
1190 // simple masks to get rid of data in the high part of the register: the 16-byte entry at offset 16*i keeps only the low i bytes.
1191 DATA masks<>+0x00(SB)/8, $0x0000000000000000
1192 DATA masks<>+0x08(SB)/8, $0x0000000000000000
1193 DATA masks<>+0x10(SB)/8, $0x00000000000000ff
1194 DATA masks<>+0x18(SB)/8, $0x0000000000000000
1195 DATA masks<>+0x20(SB)/8, $0x000000000000ffff
1196 DATA masks<>+0x28(SB)/8, $0x0000000000000000
1197 DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
1198 DATA masks<>+0x38(SB)/8, $0x0000000000000000
1199 DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
1200 DATA masks<>+0x48(SB)/8, $0x0000000000000000
1201 DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
1202 DATA masks<>+0x58(SB)/8, $0x0000000000000000
1203 DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
1204 DATA masks<>+0x68(SB)/8, $0x0000000000000000
1205 DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
1206 DATA masks<>+0x78(SB)/8, $0x0000000000000000
1207 DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
1208 DATA masks<>+0x88(SB)/8, $0x0000000000000000
1209 DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
1210 DATA masks<>+0x98(SB)/8, $0x00000000000000ff
1211 DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
1212 DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
1213 DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
1214 DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
1215 DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
1216 DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
1217 DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
1218 DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
1219 DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
1220 DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
1221 DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
1222 DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
1223 GLOBL masks<>(SB),RODATA,$256
1225 // these are arguments to pshufb. They move data down from
1226 // the high bytes of the register to the low bytes of the register.
1227 // index is how many bytes to move; a 0xff control byte zeroes the corresponding result byte.
1228 DATA shifts<>+0x00(SB)/8, $0x0000000000000000
1229 DATA shifts<>+0x08(SB)/8, $0x0000000000000000
1230 DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
1231 DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
1232 DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
1233 DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
1234 DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
1235 DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
1236 DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
1237 DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
1238 DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
1239 DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
1240 DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
1241 DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
1242 DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
1243 DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
1244 DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
1245 DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
1246 DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
1247 DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
1248 DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
1249 DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
1250 DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
1251 DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
1252 DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
1253 DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
1254 DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
1255 DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
1256 DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
1257 DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
1258 DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
1259 DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
1260 GLOBL shifts<>(SB),RODATA,$256
1262 TEXT runtime·memeq(SB),NOSPLIT,$0-25 // 24 bytes of arguments plus a 1-byte result
1265 MOVQ size+16(FP), BX // BX = size
1266 CALL runtime·memeqbody(SB) // shared comparison loop; takes the length in BX
1270 // memequal_varlen(a, b unsafe.Pointer) bool
1271 TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17 // the size comes from the closure in DX rather than from an argument
1276 MOVQ 8(DX), BX // compiler stores size at offset 8 in the closure
1277 CALL runtime·memeqbody(SB) // shared comparison loop; takes the length in BX
1284 // eqstring tests whether two strings are equal.
1285 // The compiler guarantees that strings passed
1286 // to eqstring have equal length.
1287 // See runtime_test.go:eqstring_generic for
1288 // equivalent Go code.
1289 TEXT runtime·eqstring(SB),NOSPLIT,$0-33
1290 MOVQ s1str+0(FP), SI // SI = data pointer of s1
1291 MOVQ s2str+16(FP), DI // DI = data pointer of s2
1294 MOVQ s1len+8(FP), BX // BX = len(s1); only one length is needed because the lengths are equal
1295 CALL runtime·memeqbody(SB) // compare BX bytes at SI and DI
1305 TEXT runtime·memeqbody(SB),NOSPLIT,$0-0 // register inputs as set up by eqstring: SI, DI = the two buffers, BX = length
1311 // 64 bytes at a time using xmm registers
1338 // 8 bytes at a time using 64-bit register
1351 // remaining 0-8 bytes
1353 MOVQ -8(SI)(BX*1), CX // CX = the 8 bytes ending at SI+BX
1354 MOVQ -8(DI)(BX*1), DX // DX = the 8 bytes ending at DI+BX
1369 // load at SI won't cross a page boundary.
1373 // address ends in 11111xxx. Load up to bytes we want, move to correct position.
1374 MOVQ -8(SI)(BX*1), SI // SI = the 8 bytes ending at SI+BX (pointer replaced by data)
1384 MOVQ -8(DI)(BX*1), DI // DI = the 8 bytes ending at DI+BX (pointer replaced by data)
1394 TEXT runtime·cmpstring(SB),NOSPLIT,$0-40 // loads both strings into registers and hands off to cmpbody
1395 MOVQ s1_base+0(FP), SI // SI = s1 data pointer
1396 MOVQ s1_len+8(FP), BX // BX = len(s1)
1397 MOVQ s2_base+16(FP), DI // DI = s2 data pointer
1398 MOVQ s2_len+24(FP), DX // DX = len(s2)
1399 CALL runtime·cmpbody(SB)
1403 TEXT bytes·Compare(SB),NOSPLIT,$0-56
1408 CALL runtime·cmpbody(SB) // same comparison routine that runtime·cmpstring calls
1419 TEXT runtime·cmpbody(SB),NOSPLIT,$0-0 // register inputs as set up by cmpstring: SI/BX = a and its length, DI/DX = b and its length
1424 CMOVQLT BX, R8 // R8 = min(alen, blen) = # of bytes to compare
1435 XORQ $0xffff, AX // convert EQ to NE
1436 JNE diff16 // branch if at least one byte is not equal
1442 // AX = bit mask of differences
1444 BSFQ AX, BX // index of first byte that differs
1449 LEAQ -1(AX*2), AX // convert 1/0 to +1/-1
1452 // 0 through 16 bytes left, alen>=8, blen>=8
1461 MOVQ -8(SI)(R8*1), AX // AX = the 8 bytes ending at SI+R8
1462 MOVQ -8(DI)(R8*1), CX // CX = the 8 bytes ending at DI+R8
1466 // AX and CX contain parts of a and b that differ.
1468 BSWAPQ AX // reverse order of bytes
1471 BSRQ CX, CX // index of highest bit difference
1472 SHRQ CX, AX // move a's bit to bottom
1473 ANDQ $1, AX // mask bit
1474 LEAQ -1(AX*2), AX // 1/0 => +1/-1
1477 // 0-7 bytes in common
1479 LEAQ (R8*8), CX // bytes left -> bits left
1480 NEGQ CX // - bits left (== 64 - bits left mod 64)
1483 // load bytes of a into high bytes of SI
1489 MOVQ -8(SI)(R8*1), SI
1494 // load bytes of b into high bytes of DI
1500 MOVQ -8(DI)(R8*1), DI
1505 BSWAPQ SI // reverse order of bytes
1507 XORQ SI, DI // find bit differences
1509 BSRQ DI, CX // index of highest bit difference
1510 SHRQ CX, SI // move a's bit to bottom
1511 ANDQ $1, SI // mask bit
1512 LEAQ -1(SI*2), AX // 1/0 => +1/-1
1519 SETGT AX // 1 if alen > blen
1520 SETEQ CX // 1 if alen == blen
1521 LEAQ -1(CX)(AX*2), AX // 1,0,-1 result: AX = 2*AX + CX - 1
1524 TEXT bytes·IndexByte(SB),NOSPLIT,$0
1526 MOVQ s_len+8(FP), BX // BX = len(s)
1528 CALL runtime·indexbytebody(SB) // search routine shared with strings·IndexByte
1532 TEXT strings·IndexByte(SB),NOSPLIT,$0
1534 MOVQ s_len+8(FP), BX // BX = len(s)
1536 CALL runtime·indexbytebody(SB) // search routine shared with bytes·IndexByte
1546 TEXT runtime·indexbytebody(SB),NOSPLIT,$0
1552 // round up to first 16-byte boundary
1559 // search the beginning
1564 // DI is 16-byte aligned; get ready to search using SSE instructions
1566 // round down to last 16-byte boundary
1571 // shuffle X0 around so that each byte contains c
1579 // move the next 16-byte chunk of the buffer into X1
1581 // compare bytes in X0 to X1
1583 // take the top bit of each byte in X1 and put the result in DX
1597 // if CX == 0, the zero flag will be set and we'll end up
1598 // returning a false success
1607 // handle for lengths < 16
1615 // we've found the chunk containing the byte
1616 // now just figure out which specific byte it is
1618 // get the index of the least significant set bit
1631 TEXT bytes·Equal(SB),NOSPLIT,$0-49 // two slice headers (24 bytes each) plus a 1-byte result
1632 MOVQ a_len+8(FP), BX // BX = len(a)
1633 MOVQ b_len+32(FP), CX // CX = len(b)
1639 CALL runtime·memeqbody(SB) // compare the contents; memeqbody takes the length in BX
1644 // A Duff's device for zeroing memory.
1645 // The compiler jumps to computed addresses within
1646 // this routine to zero chunks of memory. Do not
1647 // change this code without also changing the code
1648 // in ../../cmd/6g/ggen.c:clearfat.
1650 // DI: ptr to memory to be zeroed
1651 // DI is updated as a side effect.
1652 TEXT runtime·duffzero(SB), NOSPLIT, $0-0
1783 // A Duff's device for copying memory.
1784 // The compiler jumps to computed addresses within
1785 // this routine to copy chunks of memory. Source
1786 // and destination must not overlap. Do not
1787 // change this code without also changing the code
1788 // in ../../cmd/6g/cgen.c:sgen.
1789 // SI: ptr to source memory
1790 // DI: ptr to destination memory
1791 // SI and DI are updated as a side effect.
1793 // NOTE: this is equivalent to a sequence of MOVSQ but
1794 // for some reason that is 3.5x slower than this code.
1795 // The STOSQ above seems fine, though.
1796 TEXT runtime·duffcopy(SB), NOSPLIT, $0-0
2439 TEXT runtime·fastrand1(SB), NOSPLIT, $0-4 // 4-byte result; generator state is kept in m->fastrand
2443 MOVL m_fastrand(AX), DX // DX = m->fastrand (AX = m)
2446 XORL $0x88888eef, DX // mix in the constant 0x88888eef
2448 MOVL DX, m_fastrand(AX) // store the updated state back into m->fastrand
2452 TEXT runtime·return0(SB), NOSPLIT, $0
2457 // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
2458 // Must obey the gcc calling convention.
2459 TEXT _cgo_topofstack(SB),NOSPLIT,$0
2464 MOVQ (g_stack+stack_hi)(AX), AX // AX = stack.hi of the g in AX; AX is also the return-value register
2467 // The top-most function running on a goroutine
2468 // returns to goexit+PCQuantum.
2469 TEXT runtime·goexit(SB),NOSPLIT,$0-0
2471 CALL runtime·goexit1(SB) // does not return
2473 TEXT runtime·getg(SB),NOSPLIT,$0-8
2479 TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
2484 TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
2489 TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
2494 TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8