]> Cypherpunks.ru repositories - gostls13.git/blob - src/internal/trace/goroutinesv2.go
internal/trace: implement goroutine analysis for v2 traces
[gostls13.git] / src / internal / trace / goroutinesv2.go
1 // Copyright 2023 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 package trace
6
7 import (
8         tracev2 "internal/trace/v2"
9         "io"
10         "sort"
11         "time"
12 )
13
14 // GoroutineSummary contains statistics and execution details of a single goroutine.
15 // (For v2 traces.)
16 type GoroutineSummary struct {
17         ID           tracev2.GoID
18         Name         string       // A non-unique human-friendly identifier for the goroutine.
19         PC           uint64       // The start PC of the goroutine.
20         CreationTime tracev2.Time // Timestamp of the first appearance in the trace.
21         StartTime    tracev2.Time // Timestamp of the first time it started running. 0 if the goroutine never ran.
22         EndTime      tracev2.Time // Timestamp of when the goroutine exited. 0 if the goroutine never exited.
23
24         // List of regions in the goroutine, sorted based on the start time.
25         Regions []*UserRegionSummary
26
27         // Statistics of execution time during the goroutine execution.
28         GoroutineExecStats
29
30         // goroutineSummary is state used just for computing this structure.
31         // It's dropped before being returned to the caller.
32         //
33         // More specifically, if it's nil, it indicates that this summary has
34         // already been finalized.
35         *goroutineSummary
36 }
37
38 // UserRegionSummary represents a region and goroutine execution stats
39 // while the region was active. (For v2 traces.)
40 type UserRegionSummary struct {
41         TaskID tracev2.TaskID
42         Name   string
43
44         // Region start event. Normally EventRegionBegin event or nil,
45         // but can be a state transition event from NotExist or Undetermined
46         // if the region is a synthetic region representing task inheritance
47         // from the parent goroutine.
48         Start *tracev2.Event
49
50         // Region end event. Normally EventRegionEnd event or nil,
51         // but can be a state transition event to NotExist if the goroutine
52         // terminated without explicitly ending the region.
53         End *tracev2.Event
54
55         GoroutineExecStats
56 }
57
58 // GoroutineExecStats contains statistics about a goroutine's execution
59 // during a period of time.
60 type GoroutineExecStats struct {
61         ExecTime          time.Duration
62         SchedWaitTime     time.Duration
63         BlockTimeByReason map[string]time.Duration
64         SyscallTime       time.Duration
65         SyscallBlockTime  time.Duration
66         RangeTime         map[string]time.Duration
67         TotalTime         time.Duration
68 }
69
70 // sub returns the stats v-s.
71 func (s GoroutineExecStats) sub(v GoroutineExecStats) (r GoroutineExecStats) {
72         r = s.clone()
73         r.ExecTime -= v.ExecTime
74         r.SchedWaitTime -= v.SchedWaitTime
75         for reason := range s.BlockTimeByReason {
76                 r.BlockTimeByReason[reason] -= v.BlockTimeByReason[reason]
77         }
78         r.SyscallTime -= v.SyscallTime
79         r.SyscallBlockTime -= v.SyscallBlockTime
80         r.TotalTime -= v.TotalTime
81         for name := range s.RangeTime {
82                 r.RangeTime[name] -= v.RangeTime[name]
83         }
84         return r
85 }
86
87 func (s GoroutineExecStats) clone() (r GoroutineExecStats) {
88         r = s
89         r.BlockTimeByReason = make(map[string]time.Duration)
90         for reason, dt := range s.BlockTimeByReason {
91                 r.BlockTimeByReason[reason] = dt
92         }
93         r.RangeTime = make(map[string]time.Duration)
94         for name, dt := range s.RangeTime {
95                 r.RangeTime[name] = dt
96         }
97         return r
98 }
99
100 // snapshotStat returns the snapshot of the goroutine execution statistics.
101 // This is called as we process the ordered trace event stream. lastTs is used
102 // to process pending statistics if this is called before any goroutine end event.
103 func (g *GoroutineSummary) snapshotStat(lastTs tracev2.Time) (ret GoroutineExecStats) {
104         ret = g.GoroutineExecStats.clone()
105
106         if g.goroutineSummary == nil {
107                 return ret // Already finalized; no pending state.
108         }
109
110         // Set the total time if necessary.
111         if g.TotalTime == 0 {
112                 ret.TotalTime = lastTs.Sub(g.CreationTime)
113         }
114
115         // Add in time since lastTs.
116         if g.lastStartTime != 0 {
117                 ret.ExecTime += lastTs.Sub(g.lastStartTime)
118         }
119         if g.lastRunnableTime != 0 {
120                 ret.SchedWaitTime += lastTs.Sub(g.lastRunnableTime)
121         }
122         if g.lastBlockTime != 0 {
123                 ret.BlockTimeByReason[g.lastBlockReason] += lastTs.Sub(g.lastBlockTime)
124         }
125         if g.lastSyscallTime != 0 {
126                 ret.SyscallTime += lastTs.Sub(g.lastSyscallTime)
127         }
128         if g.lastSyscallBlockTime != 0 {
129                 ret.SchedWaitTime += lastTs.Sub(g.lastSyscallBlockTime)
130         }
131         for name, ts := range g.lastRangeTime {
132                 ret.RangeTime[name] += lastTs.Sub(ts)
133         }
134         return ret
135 }
136
137 // finalize is called when processing a goroutine end event or at
138 // the end of trace processing. This finalizes the execution stat
139 // and any active regions in the goroutine, in which case trigger is nil.
140 func (g *GoroutineSummary) finalize(lastTs tracev2.Time, trigger *tracev2.Event) {
141         if trigger != nil {
142                 g.EndTime = trigger.Time()
143         }
144         finalStat := g.snapshotStat(lastTs)
145
146         g.GoroutineExecStats = finalStat
147
148         // System goroutines are never part of regions, even though they
149         // "inherit" a task due to creation (EvGoCreate) from within a region.
150         // This may happen e.g. if the first GC is triggered within a region,
151         // starting the GC worker goroutines.
152         if !IsSystemGoroutine(g.Name) {
153                 for _, s := range g.activeRegions {
154                         s.End = trigger
155                         s.GoroutineExecStats = finalStat.sub(s.GoroutineExecStats)
156                         g.Regions = append(g.Regions, s)
157                 }
158         }
159         *(g.goroutineSummary) = goroutineSummary{}
160 }
161
162 // goroutineSummary is a private part of GoroutineSummary that is required only during analysis.
163 type goroutineSummary struct {
164         lastStartTime        tracev2.Time
165         lastRunnableTime     tracev2.Time
166         lastBlockTime        tracev2.Time
167         lastBlockReason      string
168         lastSyscallTime      tracev2.Time
169         lastSyscallBlockTime tracev2.Time
170         lastRangeTime        map[string]tracev2.Time
171         activeRegions        []*UserRegionSummary // stack of active regions
172 }
173
174 // SummarizeGoroutines generates statistics for all goroutines in the trace.
175 func SummarizeGoroutines(trace io.Reader) (map[tracev2.GoID]*GoroutineSummary, error) {
176         // Create the analysis state.
177         b := goroutineStatsBuilder{
178                 gs:          make(map[tracev2.GoID]*GoroutineSummary),
179                 syscallingP: make(map[tracev2.ProcID]tracev2.GoID),
180                 syscallingG: make(map[tracev2.GoID]tracev2.ProcID),
181                 rangesP:     make(map[rangeP]tracev2.GoID),
182         }
183
184         // Process the trace.
185         r, err := tracev2.NewReader(trace)
186         if err != nil {
187                 return nil, err
188         }
189         for {
190                 ev, err := r.ReadEvent()
191                 if err == io.EOF {
192                         break
193                 }
194                 if err != nil {
195                         return nil, err
196                 }
197                 b.event(ev)
198         }
199         return b.finalize(), nil
200 }
201
202 // goroutineStatsBuilder constructs per-goroutine time statistics for v2 traces.
203 type goroutineStatsBuilder struct {
204         // gs contains the map of goroutine summaries we're building up to return to the caller.
205         gs map[tracev2.GoID]*GoroutineSummary
206
207         // syscallingP and syscallingG represent a binding between a P and G in a syscall.
208         // Used to correctly identify and clean up after syscalls (blocking or otherwise).
209         syscallingP map[tracev2.ProcID]tracev2.GoID
210         syscallingG map[tracev2.GoID]tracev2.ProcID
211
212         // rangesP is used for optimistic tracking of P-based ranges for goroutines.
213         //
214         // It's a best-effort mapping of an active range on a P to the goroutine we think
215         // is associated with it.
216         rangesP map[rangeP]tracev2.GoID
217
218         lastTs tracev2.Time // timestamp of the last event processed.
219         syncTs tracev2.Time // timestamp of the last sync event processed (or the first timestamp in the trace).
220 }
221
222 type rangeP struct {
223         id   tracev2.ProcID
224         name string
225 }
226
227 // event feeds a single event into the stats builder.
228 func (b *goroutineStatsBuilder) event(ev tracev2.Event) {
229         if b.syncTs == 0 {
230                 b.syncTs = ev.Time()
231         }
232         b.lastTs = ev.Time()
233
234         switch ev.Kind() {
235         // Record sync time for the RangeActive events.
236         case tracev2.EventSync:
237                 b.syncTs = ev.Time()
238
239         // Handle state transitions.
240         case tracev2.EventStateTransition:
241                 st := ev.StateTransition()
242                 switch st.Resource.Kind {
243                 // Handle goroutine transitions, which are the meat of this computation.
244                 case tracev2.ResourceGoroutine:
245                         id := st.Resource.Goroutine()
246                         old, new := st.Goroutine()
247                         if old == new {
248                                 // Skip these events; they're not telling us anything new.
249                                 break
250                         }
251
252                         // Handle transition out.
253                         g := b.gs[id]
254                         switch old {
255                         case tracev2.GoUndetermined, tracev2.GoNotExist:
256                                 g = &GoroutineSummary{ID: id, goroutineSummary: &goroutineSummary{}}
257                                 // If we're coming out of GoUndetermined, then the creation time is the
258                                 // time of the last sync.
259                                 if old == tracev2.GoUndetermined {
260                                         g.CreationTime = b.syncTs
261                                 } else {
262                                         g.CreationTime = ev.Time()
263                                 }
264                                 // The goroutine is being created, or it's being named for the first time.
265                                 g.lastRangeTime = make(map[string]tracev2.Time)
266                                 g.BlockTimeByReason = make(map[string]time.Duration)
267                                 g.RangeTime = make(map[string]time.Duration)
268
269                                 // When a goroutine is newly created, inherit the task
270                                 // of the active region. For ease handling of this
271                                 // case, we create a fake region description with the
272                                 // task id. This isn't strictly necessary as this
273                                 // goroutine may not be associated with the task, but
274                                 // it can be convenient to see all children created
275                                 // during a region.
276                                 //
277                                 // N.B. ev.Goroutine() will always be NoGoroutine for the
278                                 // Undetermined case, so this is will simply not fire.
279                                 if creatorG := b.gs[ev.Goroutine()]; creatorG != nil && len(creatorG.activeRegions) > 0 {
280                                         regions := creatorG.activeRegions
281                                         s := regions[len(regions)-1]
282                                         if s.TaskID != tracev2.NoTask {
283                                                 g.activeRegions = []*UserRegionSummary{{TaskID: s.TaskID, Start: &ev}}
284                                         }
285                                 }
286                                 b.gs[g.ID] = g
287                         case tracev2.GoRunning:
288                                 // Record execution time as we transition out of running
289                                 g.ExecTime += ev.Time().Sub(g.lastStartTime)
290                                 g.lastStartTime = 0
291                         case tracev2.GoWaiting:
292                                 // Record block time as we transition out of waiting.
293                                 if g.lastBlockTime != 0 {
294                                         g.BlockTimeByReason[g.lastBlockReason] += ev.Time().Sub(g.lastBlockTime)
295                                         g.lastBlockTime = 0
296                                 }
297                         case tracev2.GoRunnable:
298                                 // Record sched latency time as we transition out of runnable.
299                                 if g.lastRunnableTime != 0 {
300                                         g.SchedWaitTime += ev.Time().Sub(g.lastRunnableTime)
301                                         g.lastRunnableTime = 0
302                                 }
303                         case tracev2.GoSyscall:
304                                 // Record syscall execution time and syscall block time as we transition out of syscall.
305                                 if g.lastSyscallTime != 0 {
306                                         if g.lastSyscallBlockTime != 0 {
307                                                 g.SyscallBlockTime += ev.Time().Sub(g.lastSyscallBlockTime)
308                                                 g.SyscallTime += g.lastSyscallBlockTime.Sub(g.lastSyscallTime)
309                                         } else {
310                                                 g.SyscallTime += ev.Time().Sub(g.lastSyscallTime)
311                                         }
312                                         g.lastSyscallTime = 0
313                                         g.lastSyscallBlockTime = 0
314
315                                         // Clear the syscall map.
316                                         delete(b.syscallingP, b.syscallingG[id])
317                                         delete(b.syscallingG, id)
318                                 }
319                         }
320
321                         // The goroutine hasn't been identified yet. Take any stack we
322                         // can get and identify it by the bottom-most frame of that stack.
323                         if g.PC == 0 {
324                                 stk := ev.Stack()
325                                 if stk != tracev2.NoStack {
326                                         var frame tracev2.StackFrame
327                                         var ok bool
328                                         stk.Frames(func(f tracev2.StackFrame) bool {
329                                                 frame = f
330                                                 ok = true
331                                                 return false
332                                         })
333                                         if ok {
334                                                 g.PC = frame.PC
335                                                 g.Name = frame.Func
336                                         }
337                                 }
338                         }
339
340                         // Handle transition in.
341                         switch new {
342                         case tracev2.GoRunning:
343                                 // We started running. Record it.
344                                 g.lastStartTime = ev.Time()
345                                 if g.StartTime == 0 {
346                                         g.StartTime = ev.Time()
347                                 }
348                         case tracev2.GoRunnable:
349                                 g.lastRunnableTime = ev.Time()
350                         case tracev2.GoWaiting:
351                                 if st.Reason != "forever" {
352                                         g.lastBlockTime = ev.Time()
353                                         g.lastBlockReason = st.Reason
354                                         break
355                                 }
356                                 // "Forever" is like goroutine death.
357                                 fallthrough
358                         case tracev2.GoNotExist:
359                                 g.finalize(ev.Time(), &ev)
360                         case tracev2.GoSyscall:
361                                 b.syscallingP[ev.Proc()] = id
362                                 b.syscallingG[id] = ev.Proc()
363                                 g.lastSyscallTime = ev.Time()
364                         }
365
366                 // Handle procs to detect syscall blocking, which si identifiable as a
367                 // proc going idle while the goroutine it was attached to is in a syscall.
368                 case tracev2.ResourceProc:
369                         id := st.Resource.Proc()
370                         old, new := st.Proc()
371                         if old != new && new == tracev2.ProcIdle {
372                                 if goid, ok := b.syscallingP[id]; ok {
373                                         g := b.gs[goid]
374                                         g.lastSyscallBlockTime = ev.Time()
375                                         delete(b.syscallingP, id)
376                                 }
377                         }
378                 }
379
380         // Handle ranges of all kinds.
381         case tracev2.EventRangeBegin, tracev2.EventRangeActive:
382                 r := ev.Range()
383                 var g *GoroutineSummary
384                 switch r.Scope.Kind {
385                 case tracev2.ResourceGoroutine:
386                         // Simple goroutine range. We attribute the entire range regardless of
387                         // goroutine stats. Lots of situations are still identifiable, e.g. a
388                         // goroutine blocked often in mark assist will have both high mark assist
389                         // and high block times. Those interested in a deeper view can look at the
390                         // trace viewer.
391                         g = b.gs[r.Scope.Goroutine()]
392                 case tracev2.ResourceProc:
393                         // N.B. These ranges are not actually bound to the goroutine, they're
394                         // bound to the P. But if we happen to be on the P the whole time, let's
395                         // try to attribute it to the goroutine. (e.g. GC sweeps are here.)
396                         g = b.gs[ev.Goroutine()]
397                         if g != nil {
398                                 b.rangesP[rangeP{id: r.Scope.Proc(), name: r.Name}] = ev.Goroutine()
399                         }
400                 }
401                 if g == nil {
402                         break
403                 }
404                 if ev.Kind() == tracev2.EventRangeActive {
405                         if ts := g.lastRangeTime[r.Name]; ts != 0 {
406                                 g.RangeTime[r.Name] += b.syncTs.Sub(ts)
407                         }
408                         g.lastRangeTime[r.Name] = b.syncTs
409                 } else {
410                         g.lastRangeTime[r.Name] = ev.Time()
411                 }
412         case tracev2.EventRangeEnd:
413                 r := ev.Range()
414                 var g *GoroutineSummary
415                 switch r.Scope.Kind {
416                 case tracev2.ResourceGoroutine:
417                         g = b.gs[r.Scope.Goroutine()]
418                 case tracev2.ResourceProc:
419                         rp := rangeP{id: r.Scope.Proc(), name: r.Name}
420                         if goid, ok := b.rangesP[rp]; ok {
421                                 if goid == ev.Goroutine() {
422                                         // As the comment in the RangeBegin case states, this is only OK
423                                         // if we finish on the same goroutine we started on.
424                                         g = b.gs[goid]
425                                 }
426                                 delete(b.rangesP, rp)
427                         }
428                 }
429                 if g == nil {
430                         break
431                 }
432                 ts := g.lastRangeTime[r.Name]
433                 if ts == 0 {
434                         break
435                 }
436                 g.RangeTime[r.Name] += ev.Time().Sub(ts)
437                 delete(g.lastRangeTime, r.Name)
438
439         // Handle user-defined regions.
440         case tracev2.EventRegionBegin:
441                 g := b.gs[ev.Goroutine()]
442                 r := ev.Region()
443                 g.activeRegions = append(g.activeRegions, &UserRegionSummary{
444                         Name:               r.Type,
445                         TaskID:             r.Task,
446                         Start:              &ev,
447                         GoroutineExecStats: g.snapshotStat(ev.Time()),
448                 })
449         case tracev2.EventRegionEnd:
450                 g := b.gs[ev.Goroutine()]
451                 r := ev.Region()
452                 var sd *UserRegionSummary
453                 if regionStk := g.activeRegions; len(regionStk) > 0 {
454                         // Pop the top region from the stack since that's what must have ended.
455                         n := len(regionStk)
456                         sd = regionStk[n-1]
457                         regionStk = regionStk[:n-1]
458                         g.activeRegions = regionStk
459                 } else {
460                         // This is an "end" without a start. Just fabricate the region now.
461                         sd = &UserRegionSummary{Name: r.Type, TaskID: r.Task}
462                 }
463                 sd.GoroutineExecStats = g.snapshotStat(ev.Time()).sub(sd.GoroutineExecStats)
464                 sd.End = &ev
465                 g.Regions = append(g.Regions, sd)
466         }
467 }
468
469 // finalize indicates to the builder that we're done processing the trace.
470 // It cleans up any remaining state and returns the full summary.
471 func (b *goroutineStatsBuilder) finalize() map[tracev2.GoID]*GoroutineSummary {
472         for _, g := range b.gs {
473                 g.finalize(b.lastTs, nil)
474
475                 // Sort based on region start time.
476                 sort.Slice(g.Regions, func(i, j int) bool {
477                         x := g.Regions[i].Start
478                         y := g.Regions[j].Start
479                         if x == nil {
480                                 return true
481                         }
482                         if y == nil {
483                                 return false
484                         }
485                         return x.Time() < y.Time()
486                 })
487                 g.goroutineSummary = nil
488         }
489         return b.gs
490 }
491
492 // RelatedGoroutinesV2 finds a set of goroutines related to goroutine goid for v2 traces.
493 // The association is based on whether they have synchronized with each other in the Go
494 // scheduler (one has unblocked another).
495 func RelatedGoroutinesV2(trace io.Reader, goid tracev2.GoID) (map[tracev2.GoID]struct{}, error) {
496         r, err := tracev2.NewReader(trace)
497         if err != nil {
498                 return nil, err
499         }
500         // Process all the events, looking for transitions of goroutines
501         // out of GoWaiting. If there was an active goroutine when this
502         // happened, then we know that active goroutine unblocked another.
503         // Scribble all these down so we can process them.
504         type unblockEdge struct {
505                 operator tracev2.GoID
506                 operand  tracev2.GoID
507         }
508         var unblockEdges []unblockEdge
509         for {
510                 ev, err := r.ReadEvent()
511                 if err == io.EOF {
512                         break
513                 }
514                 if err != nil {
515                         return nil, err
516                 }
517                 if ev.Goroutine() == tracev2.NoGoroutine {
518                         continue
519                 }
520                 if ev.Kind() != tracev2.EventStateTransition {
521                         continue
522                 }
523                 st := ev.StateTransition()
524                 if st.Resource.Kind != tracev2.ResourceGoroutine {
525                         continue
526                 }
527                 id := st.Resource.Goroutine()
528                 old, new := st.Goroutine()
529                 if old == new || old != tracev2.GoWaiting {
530                         continue
531                 }
532                 unblockEdges = append(unblockEdges, unblockEdge{
533                         operator: ev.Goroutine(),
534                         operand:  id,
535                 })
536         }
537         // Compute the transitive closure of depth 2 of goroutines that have unblocked each other
538         // (starting from goid).
539         gmap := make(map[tracev2.GoID]struct{})
540         gmap[goid] = struct{}{}
541         for i := 0; i < 2; i++ {
542                 // Copy the map.
543                 gmap1 := make(map[tracev2.GoID]struct{})
544                 for g := range gmap {
545                         gmap1[g] = struct{}{}
546                 }
547                 for _, edge := range unblockEdges {
548                         if _, ok := gmap[edge.operand]; ok {
549                                 gmap1[edge.operator] = struct{}{}
550                         }
551                 }
552                 gmap = gmap1
553         }
554         return gmap, nil
555 }