// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// CPU profiling.
// Based on algorithms and data structures used in
// https://github.com/google/pprof.
//
// The main difference between this code and the google-perftools
// code is that this code is written to allow copying the profile data
// to an arbitrary io.Writer, while the google-perftools code always
// writes to an operating system file.
//
// The signal handler for the profiling clock tick adds a new stack trace
// to a hash table tracking counts for recent traces. Most clock ticks
// hit in the cache. In the event of a cache miss, an entry must be
// evicted from the hash table, copied to a log that will eventually be
// written as profile data. The google-perftools code flushed the
// log itself during the signal handler. This code cannot do that, because
// the io.Writer might block or need system calls or locks that are not
// safe to use from within the signal handler. Instead, we split the log
// into two halves and let the signal handler fill one half while a goroutine
// is writing out the other half. When the signal handler fills its half, it
// offers to swap with the goroutine. If the writer is not done with its half,
// we lose the stack trace for this clock tick (and record that loss).
// The goroutine interacts with the signal handler by calling getprofile() to
// get the next log piece to write, implicitly handing back the last log
// piece it obtained.
//
// The state of this dance between the signal handler and the goroutine
// is encoded in the cpuProfile.handoff field. If handoff == 0, then the
// goroutine is not using either log half and is waiting (or will soon be
// waiting) for a new piece by calling notesleep(&p.wait). If the signal
// handler changes handoff from 0 to non-zero, it must call notewakeup(&p.wait)
// to wake the goroutine. The value indicates the number of entries in the
// log half being handed off. The goroutine leaves the non-zero value in
// place until it has finished processing the log half and then flips the number
// back to zero. Setting the high bit in handoff means that the profiling is over,
// and the goroutine is now in charge of flushing the data left in the hash table
// to the log and returning that data.
//
// The handoff field is manipulated using atomic operations.
// For the most part, the manipulation of handoff is orderly: if handoff == 0
// then the signal handler owns it and can change it to non-zero.
// If handoff != 0 then the goroutine owns it and can change it to zero.
// If that were the end of the story then we would not need to manipulate
// handoff using atomic operations. The operations are needed, however,
// in order to let the log closer set the high bit to indicate "EOF" safely
// in the situation when normally the goroutine "owns" handoff.

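// The ownership rules above can be modeled outside the runtime with a
// single atomic word standing in for handoff and a buffered channel
// standing in for the note sleep/wakeup pair on p.wait. This is only an
// illustrative sketch using sync/atomic, not runtime code; offer, take
// and wake are invented names for the example:
//
//	var handoff uint32                // 0 = handler owns a half; n = n entries handed off
//	var wake = make(chan struct{}, 1) // models notewakeup/notesleep
//
//	// offer plays the signal handler's role in flushlog.
//	func offer(n uint32) bool {
//		if !atomic.CompareAndSwapUint32(&handoff, 0, n) {
//			return false // writer still busy; caller drops this sample
//		}
//		wake <- struct{}{} // never blocks: take drains wake before clearing handoff
//		return true
//	}
//
//	// take plays the writing goroutine's role in getprofile.
//	func take() uint32 {
//		<-wake
//		n := atomic.LoadUint32(&handoff)
//		// ... copy out the n handed-off entries ...
//		atomic.StoreUint32(&handoff, 0) // hand the half back to the signal handler
//		return n
//	}
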
package runtime

import (
	"runtime/internal/atomic"
	"unsafe"
)

const (
	numBuckets      = 1 << 10
	logSize         = 1 << 17
	assoc           = 4
	maxCPUProfStack = 64
)

type cpuprofEntry struct {
	count uintptr
	depth int
	stack [maxCPUProfStack]uintptr
}

//go:notinheap
type cpuProfile struct {
	on     bool    // profiling is on
	wait   note    // goroutine waits here
	count  uintptr // tick count
	evicts uintptr // eviction count
	lost   uintptr // lost ticks that need to be logged

	// Active recent stack traces.
	hash [numBuckets]struct {
		entry [assoc]cpuprofEntry
	}

	// Log of traces evicted from hash.
	// Signal handler has filled log[toggle][:nlog].
	// Goroutine is writing log[1-toggle][:handoff].
	log     [2][logSize / 2]uintptr
	nlog    int
	toggle  int32
	handoff uint32

	// Writer state.
	// Writer maintains its own toggle to avoid races
	// looking at signal handler's toggle.
	wtoggle  uint32
	wholding bool // holding & need to release a log half
	flushing bool // flushing hash table - profile is over
	eodSent  bool // special end-of-data record sent; => flushing
}

var (
	cpuprofLock mutex
	cpuprof     *cpuProfile

	eod = [3]uintptr{0, 1, 0}
)

func setcpuprofilerate(hz int32) {
	systemstack(func() {
		setcpuprofilerate_m(hz)
	})
}

// lostProfileData is a no-op function used in profiles
// to mark the number of profiling stack traces that were
// discarded due to slow data writers.
func lostProfileData() {}

// SetCPUProfileRate sets the CPU profiling rate to hz samples per second.
// If hz <= 0, SetCPUProfileRate turns off profiling.
// If the profiler is on, the rate cannot be changed without first turning it off.
//
// Most clients should use the runtime/pprof package or
// the testing package's -test.cpuprofile flag instead of calling
// SetCPUProfileRate directly.
func SetCPUProfileRate(hz int) {
	// Clamp hz to something reasonable.
	if hz < 0 {
		hz = 0
	}
	if hz > 1000000 {
		hz = 1000000
	}

	lock(&cpuprofLock)
	if hz > 0 {
		if cpuprof == nil {
			cpuprof = (*cpuProfile)(sysAlloc(unsafe.Sizeof(cpuProfile{}), &memstats.other_sys))
			if cpuprof == nil {
				print("runtime: cpu profiling cannot allocate memory\n")
				unlock(&cpuprofLock)
				return
			}
		}
		if cpuprof.on || cpuprof.handoff != 0 {
			print("runtime: cannot set cpu profile rate until previous profile has finished.\n")
			unlock(&cpuprofLock)
			return
		}

		cpuprof.on = true
		// pprof binary header format.
		// https://github.com/gperftools/gperftools/blob/master/src/profiledata.cc#L119
		p := &cpuprof.log[0]
		p[0] = 0                 // count for header
		p[1] = 3                 // depth for header
		p[2] = 0                 // version number
		p[3] = uintptr(1e6 / hz) // period (microseconds)
		p[4] = 0
		cpuprof.nlog = 5
		cpuprof.toggle = 0
		cpuprof.wholding = false
		cpuprof.wtoggle = 0
		cpuprof.flushing = false
		cpuprof.eodSent = false
		noteclear(&cpuprof.wait)

		setcpuprofilerate(int32(hz))
	} else if cpuprof != nil && cpuprof.on {
		setcpuprofilerate(0)
		cpuprof.on = false

		// Now add is not running anymore, and getprofile owns the entire log.
		// Set the high bit in cpuprof.handoff to tell getprofile.
		for {
			n := cpuprof.handoff
			if n&0x80000000 != 0 {
				print("runtime: setcpuprofile(off) twice\n")
			}
			if atomic.Cas(&cpuprof.handoff, n, n|0x80000000) {
				if n == 0 {
					// we did the transition from 0 -> nonzero so we wake getprofile
					notewakeup(&cpuprof.wait)
				}
				break
			}
		}
	}
	unlock(&cpuprofLock)
}
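
// For reference, the words that eventually reach the writer form the
// legacy gperftools CPU profile stream (host-endian, pointer-sized words;
// see the profiledata.cc link above). This summary is derived from the
// code in this file, not from a separate specification:
//
//	header:  0, 3, 0, period-in-microseconds, 0      (written by SetCPUProfileRate above)
//	sample:  count, depth, pc1, ..., pc<depth>       (written by evict)
//	lost:    lost-count, 1, funcPC(lostProfileData)  (written by flushlog when samples were dropped)
//	trailer: 0, 1, 0                                 (the eod marker sent by getprofile)
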
// add adds the stack trace to the profile.
// It is called from signal handlers and other limited environments
// and cannot allocate memory or acquire locks that might be
// held at the time of the signal, nor can it use substantial amounts
// of stack. It is allowed to call evict.
//go:nowritebarrierrec
func (p *cpuProfile) add(pc []uintptr) {
	p.addWithFlushlog(pc, p.flushlog)
}

// addWithFlushlog implements add and addNonGo.
// It is called from signal handlers and other limited environments
// and cannot allocate memory or acquire locks that might be
// held at the time of the signal, nor can it use substantial amounts
// of stack. It may be called by a signal handler with no g or m.
// It is allowed to call evict, passing the flushlog parameter.
//go:nosplit
//go:nowritebarrierrec
func (p *cpuProfile) addWithFlushlog(pc []uintptr, flushlog func() bool) {
	if len(pc) > maxCPUProfStack {
		pc = pc[:maxCPUProfStack]
	}

	// Compute hash.
	h := uintptr(0)
	for _, x := range pc {
		h = h<<8 | (h >> (8 * (unsafe.Sizeof(h) - 1)))
		h += x * 41
	}
	p.count++

	// Add to entry count if already present in table.
	b := &p.hash[h%numBuckets]
Assoc:
	for i := range b.entry {
		e := &b.entry[i]
		if e.depth != len(pc) {
			continue
		}
		for j := range pc {
			if e.stack[j] != pc[j] {
				continue Assoc
			}
		}
		e.count++
		return
	}

	// Evict entry with smallest count.
	var e *cpuprofEntry
	for i := range b.entry {
		if e == nil || b.entry[i].count < e.count {
			e = &b.entry[i]
		}
	}
	if e.count > 0 {
		if !p.evict(e, flushlog) {
			// Could not evict entry. Record lost stack.
			p.lost++
			return
		}
		p.evicts++
	}

	// Reuse the newly evicted entry.
	e.depth = len(pc)
	e.count = 1
	copy(e.stack[:], pc)
}

// evict copies the given entry's data into the log, so that
// the entry can be reused. evict is called from add, which
// is called from the profiling signal handler, so it must not
// allocate memory or block, and it may be called with no g or m.
// It is safe to call flushlog. evict returns true if the entry was
// copied to the log, false if there was no room available.
//go:nosplit
//go:nowritebarrierrec
func (p *cpuProfile) evict(e *cpuprofEntry, flushlog func() bool) bool {
	d := e.depth
	nslot := d + 2
	log := &p.log[p.toggle]
	if p.nlog+nslot > len(log) {
		if !flushlog() {
			return false
		}
		log = &p.log[p.toggle]
	}

	q := p.nlog
	log[q] = e.count
	q++
	log[q] = uintptr(d)
	q++
	copy(log[q:], e.stack[:d])
	q += d
	p.nlog = q
	e.count = 0
	return true
}
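
// For scale: a record written by evict occupies depth+2 words, so the
// deepest possible record (maxCPUProfStack = 64 frames) takes 66 words,
// and a log half of logSize/2 = 65536 words fits roughly 990 such
// maximal records (far more for typical, shallower stacks) before
// flushlog must hand the half off to the writer.
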
// flushlog tries to flush the current log and switch to the other one.
// flushlog is called from evict, called from add, called from the signal handler,
// so it cannot allocate memory or block. It can try to swap logs with
// the writing goroutine, as explained in the comment at the top of this file.
//go:nowritebarrierrec
func (p *cpuProfile) flushlog() bool {
	if !atomic.Cas(&p.handoff, 0, uint32(p.nlog)) {
		return false
	}
	notewakeup(&p.wait)

	p.toggle = 1 - p.toggle
	log := &p.log[p.toggle]
	q := 0
	if p.lost > 0 {
		lostPC := funcPC(lostProfileData)
		log[0] = p.lost
		log[1] = 1
		log[2] = lostPC
		q = 3
		p.lost = 0
	}
	p.nlog = q
	return true
}

// addNonGo is like add, but runs on a non-Go thread.
// It can't do anything that might need a g or an m.
// With this entry point, we don't try to flush the log when evicting an
// old entry. Instead, we just drop the stack trace if we're out of space.
//go:nosplit
//go:nowritebarrierrec
func (p *cpuProfile) addNonGo(pc []uintptr) {
	p.addWithFlushlog(pc, func() bool { return false })
}

// getprofile blocks until the next block of profiling data is available
// and returns it as a []byte. It is called from the writing goroutine.
func (p *cpuProfile) getprofile() []byte {
	if p == nil {
		return nil
	}

	if p.wholding {
		// Release previous log to signal handling side.
		// Loop because we are racing against SetCPUProfileRate(0).
		for {
			n := p.handoff
			if n == 0 {
				print("runtime: phase error during cpu profile handoff\n")
				return nil
			}
			if n&0x80000000 != 0 {
				p.wtoggle = 1 - p.wtoggle
				p.wholding = false
				p.flushing = true
				goto Flush
			}
			if atomic.Cas(&p.handoff, n, 0) {
				break
			}
		}
		p.wtoggle = 1 - p.wtoggle
		p.wholding = false
	}

	if p.flushing {
		goto Flush
	}

	if !p.on && p.handoff == 0 {
		return nil
	}

	// Wait for new log.
	notetsleepg(&p.wait, -1)
	noteclear(&p.wait)

	switch n := p.handoff; {
	case n == 0:
		print("runtime: phase error during cpu profile wait\n")
		return nil
	case n == 0x80000000:
		p.flushing = true
		goto Flush
	default:
		n &^= 0x80000000

		// Return new log to caller.
		p.wholding = true

		return uintptrBytes(p.log[p.wtoggle][:n])
	}

	// In flush mode.
	// Add is no longer being called. We own the log.
	// Also, p.handoff is non-zero, so flushlog will return false.
	// Evict the hash table into the log and return it.
Flush:
	for i := range p.hash {
		b := &p.hash[i]
		for j := range b.entry {
			e := &b.entry[j]
			if e.count > 0 && !p.evict(e, p.flushlog) {
				// Filled the log. Stop the loop and return what we've got.
				break Flush
			}
		}
	}

	// Return pending log data.
	if p.nlog > 0 {
		// Note that we're using toggle now, not wtoggle,
		// because we're working on the log directly.
		n := p.nlog
		p.nlog = 0
		return uintptrBytes(p.log[p.toggle][:n])
	}

	// Made it through the table without finding anything to log.
	if !p.eodSent {
		// We may not have space to append this to the partial log buf,
		// so we always return a new slice for the end-of-data marker.
		p.eodSent = true
		return uintptrBytes(eod[:])
	}

	// Finally done. Clean up and return nil.
	p.flushing = false
	if !atomic.Cas(&p.handoff, p.handoff, 0) {
		print("runtime: profile flush racing with something\n")
	}
	return nil
}

func uintptrBytes(p []uintptr) (ret []byte) {
	pp := (*slice)(unsafe.Pointer(&p))
	rp := (*slice)(unsafe.Pointer(&ret))

	rp.array = pp.array
	rp.len = pp.len * int(unsafe.Sizeof(p[0]))
	rp.cap = rp.len

	return
}

// CPUProfile returns the next chunk of binary CPU profiling stack trace data,
// blocking until data is available. If profiling is turned off and all the profile
// data accumulated while it was on has been returned, CPUProfile returns nil.
// The caller must save the returned data before calling CPUProfile again.
//
// Most clients should use the runtime/pprof package or
// the testing package's -test.cpuprofile flag instead of calling
// CPUProfile directly.
func CPUProfile() []byte {
	return cpuprof.getprofile()
}

//go:linkname runtime_pprof_runtime_cyclesPerSecond runtime/pprof.runtime_cyclesPerSecond
func runtime_pprof_runtime_cyclesPerSecond() int64 {
	return tickspersecond()
}
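
// For illustration only: a minimal external consumer of SetCPUProfileRate
// and CPUProfile might look like the sketch below. writeProfile is an
// invented name, not part of any package; real clients should use
// runtime/pprof, which layers further processing on top of this raw stream.
//
//	// writeProfile drains the profile into w after profiling has been
//	// enabled with runtime.SetCPUProfileRate. It returns once profiling
//	// has been turned off and all buffered data has been written.
//	func writeProfile(w io.Writer) error {
//		for {
//			data := runtime.CPUProfile() // blocks until data or end of profile
//			if data == nil {
//				return nil
//			}
//			if _, err := w.Write(data); err != nil {
//				return err
//			}
//		}
//	}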