lock.go
Go Source File · 143 lines
package collector
import ( "fmt" "time"
"github.com/cilium/ebpf/link" "github.com/cilium/ebpf/rlimit"
"github.com/os2026/ebpf-rca/internal/ksym")
// maxStackDepth is the number of uint64 frame slots ResolveStack reads from a
// single stack-map entry.
// NOTE(review): presumably this must match the stack map's value size on the
// BPF side — confirm against lock.bpf.c.
const maxStackDepth = 32
// lockStat mirrors the binary layout of struct lock_stat in lock.bpf.c. The
// field order, field widths and the fixed-size Comm array must therefore stay
// exactly as they are.
type lockStat struct {
	// Per-thread counters. Poll reports the difference of OffcpuNs and
	// OffcpuCount between two calls and passes MaxOffcpuNs through as-is.
	OffcpuNs, OffcpuCount, MaxOffcpuNs uint64

	LastWaker uint32   // tid of the most recent waker
	StackID   int32    // stack id; ResolveStack treats negative values as "no stack"
	Comm      [16]byte // raw command-name bytes
}
// LockSample holds the off-CPU blocking metrics derived for a single thread
// over one polling window.
type LockSample struct {
	// Pid identifies the thread; Poll fills it from the lock_stats map key.
	Pid uint32
	// Comm is the thread's command name.
	Comm string
	// OffcpuRatio is the blocking off-CPU time as a fraction of wall-clock
	// time (nominally 0..1).
	OffcpuRatio float64
	// BlockCount is the number of blocking events within the window.
	BlockCount uint64
	// MaxOffcpuMs is the longest single block in milliseconds (a cumulative
	// maximum rather than a per-window value).
	MaxOffcpuMs float64
	// LastWaker is the tid of the most recent waker.
	LastWaker uint32
	// StackID is the id of the blocking kernel stack, used for symbolization.
	StackID int32
}
// LockCollector 加载锁竞争场景的 eBPF 程序并读取 off-CPU 阻塞数据。type LockCollector struct { objs lockObjects links []link.Link ksyms *ksym.Table prev map[uint32]lockStat}
// NewLockCollector 加载字节码、挂载 tracepoint、载入内核符号表。func NewLockCollector() (*LockCollector, error) { if err := rlimit.RemoveMemlock(); err != nil { return nil, fmt.Errorf("remove memlock: %w", err) } c := &LockCollector{prev: make(map[uint32]lockStat)} if err := loadLockObjects(&c.objs, nil); err != nil { return nil, fmt.Errorf("load bpf objects: %w", err) } sw, err := link.Tracepoint("sched", "sched_switch", c.objs.HandleSwitch, nil) if err != nil { c.Close() return nil, fmt.Errorf("attach sched_switch: %w", err) } c.links = append(c.links, sw)
wk, err := link.Tracepoint("sched", "sched_wakeup", c.objs.HandleWakeup, nil) if err != nil { c.Close() return nil, fmt.Errorf("attach sched_wakeup: %w", err) } c.links = append(c.links, wk)
// 符号表载入失败不致命:仍可输出地址。 if t, err := ksym.Load(); err == nil { c.ksyms = t } return c, nil}
// Close 卸载探针并释放资源。func (c *LockCollector) Close() { for _, l := range c.links { _ = l.Close() } c.objs.Close()}
// Poll 读取 lock_stats,计算自上次调用以来的差分。func (c *LockCollector) Poll(interval time.Duration) ([]LockSample, error) { cur := make(map[uint32]lockStat) var key uint32 var val lockStat it := c.objs.LockStats.Iterate() for it.Next(&key, &val) { cur[key] = val } if err := it.Err(); err != nil { return nil, fmt.Errorf("iterate lock_stats: %w", err) }
intervalNs := float64(interval.Nanoseconds()) var samples []LockSample for tid, v := range cur { var dOff, dCount uint64 if p, ok := c.prev[tid]; ok { dOff = v.OffcpuNs - p.OffcpuNs dCount = v.OffcpuCount - p.OffcpuCount } else { dOff, dCount = v.OffcpuNs, v.OffcpuCount } if dOff == 0 && dCount == 0 { continue } samples = append(samples, LockSample{ Pid: tid, Comm: commToString(v.Comm), OffcpuRatio: float64(dOff) / intervalNs, BlockCount: dCount, MaxOffcpuMs: float64(v.MaxOffcpuNs) / 1e6, LastWaker: v.LastWaker, StackID: v.StackID, }) } c.prev = cur return samples, nil}
// ResolveStack 将阻塞栈 id 符号化为最多 max 个栈帧函数名。func (c *LockCollector) ResolveStack(id int32, max int) []string { if id < 0 { return nil } var frames [maxStackDepth]uint64 if err := c.objs.Stackmap.Lookup(uint32(id), &frames); err != nil { return nil } var out []string for _, a := range frames { if a == 0 { break } out = append(out, c.ksyms.Resolve(a)) if len(out) >= max { break } } return out}





