xref: /linux/tools/workqueue/wq_monitor.py (revision 58f6259b7a08f8d47d4629609703d358b042f0fd)
1#!/usr/bin/env drgn
2#
3# Copyright (C) 2023 Tejun Heo <tj@kernel.org>
4# Copyright (C) 2023 Meta Platforms, Inc. and affiliates.
5
6desc = """
7This is a drgn script to monitor workqueues. For more info on drgn, visit
8https://github.com/osandov/drgn.
9
10  total    Total number of work items executed by the workqueue.
11
12  infl     The number of currently in-flight work items.
13
14  CPUtime  Total CPU time consumed by the workqueue in seconds. This is
15           sampled from scheduler ticks and only provides ballpark
16           measurement. "nohz_full=" CPUs are excluded from measurement.
17
18  CPUitsv  The number of times a concurrency-managed work item hogged CPU
19           longer than the threshold (workqueue.cpu_intensive_thresh_us)
20           and got excluded from concurrency management to avoid stalling
21           other work items.
22
23  CMwake   The number of concurrency-management wake-ups while executing a
24           work item of the workqueue.
25
26  mayday   The number of times the rescuer was requested while waiting for
27           new worker creation.
28
29  rescued  The number of work items executed by the rescuer.
30"""
31
32import sys
33import signal
34import os
35import re
36import time
37import json
38
39import drgn
40from drgn.helpers.linux.list import list_for_each_entry,list_empty
41from drgn.helpers.linux.cpumask import for_each_possible_cpu
42
43import argparse
# Command-line interface. The module-level `desc` text doubles as the --help
# description; RawTextHelpFormatter keeps its hand-formatted column layout.
parser = argparse.ArgumentParser(
    description=desc,
    formatter_class=argparse.RawTextHelpFormatter,
)
# Zero or more regexes; main() ORs them together into a single name filter.
parser.add_argument(
    'workqueue',
    metavar='REGEX',
    nargs='*',
    help='Target workqueue name patterns (all if empty)',
)
parser.add_argument(
    '-i', '--interval',
    metavar='SECS',
    type=float,
    default=1,
    help='Monitoring interval (0 to print once and exit)',
)
parser.add_argument(
    '-j', '--json',
    action='store_true',
    help='Output in json',
)
# Parsed once at import time; main() reads the result from this global.
args = parser.parse_args()
53
def err(s):
    """Write *s* and a newline to stderr, flush, then exit with status 1."""
    sys.stderr.write(str(s) + '\n')
    sys.stderr.flush()
    raise SystemExit(1)
57
# Kernel objects and constants, looked up by name through `prog`. `prog` is not
# defined in this file; it is expected to be supplied by the drgn runtime that
# executes the script (see the shebang line).

# Head of the list that main() walks to visit every workqueue.
workqueues              = prog['workqueues']

# Flag bits tested against wq.flags in WqStats.__init__().
WQ_UNBOUND              = prog['WQ_UNBOUND']
WQ_MEM_RECLAIM          = prog['WQ_MEM_RECLAIM']

# Indices into a pool_workqueue's stats[] array; PWQ_NR_STATS is the number of
# entries WqStats sums over.
PWQ_STAT_STARTED        = prog['PWQ_STAT_STARTED']      # work items started execution
PWQ_STAT_COMPLETED      = prog['PWQ_STAT_COMPLETED']    # work items completed execution
PWQ_STAT_CPU_TIME       = prog['PWQ_STAT_CPU_TIME']     # total CPU time consumed
PWQ_STAT_CPU_INTENSIVE  = prog['PWQ_STAT_CPU_INTENSIVE'] # wq_cpu_intensive_thresh_us violations
PWQ_STAT_CM_WAKEUP      = prog['PWQ_STAT_CM_WAKEUP']    # concurrency-management worker wakeups
PWQ_STAT_MAYDAY         = prog['PWQ_STAT_MAYDAY']       # maydays to rescuer
PWQ_STAT_RESCUED        = prog['PWQ_STAT_RESCUED']      # linked work items executed by rescuer
PWQ_NR_STATS            = prog['PWQ_NR_STATS']
71
class WqStats:
    """Snapshot of one workqueue's counters, summed over its pool_workqueues.

    Attributes:
        name:        workqueue name decoded to str.
        unbound:     whether WQ_UNBOUND is set in the workqueue's flags.
        mem_reclaim: whether WQ_MEM_RECLAIM is set in the workqueue's flags.
        stats:       list of PWQ_NR_STATS ints, indexed by the PWQ_STAT_*
                     constants, each the sum of that counter across every
                     pool_workqueue linked on wq.pwqs.
    """

    def __init__(self, wq):
        """Read the name, flags and per-pwq counters of workqueue *wq*."""
        self.name = wq.name.string_().decode()
        # '&' already binds tighter than '!=' in Python; the parentheses only
        # make the grouping explicit for readers used to C precedence.
        self.unbound = (wq.flags & WQ_UNBOUND) != 0
        self.mem_reclaim = (wq.flags & WQ_MEM_RECLAIM) != 0
        self.stats = [0] * PWQ_NR_STATS
        for pwq in list_for_each_entry('struct pool_workqueue', wq.pwqs.address_of_(), 'pwqs_node'):
            for i in range(PWQ_NR_STATS):
                self.stats[i] += int(pwq.stats[i])

    def dict(self, now):
        """Return the snapshot as a flat dict, stamped with *now*.

        *now* is stored verbatim under 'timestamp'; the counters are reported
        raw (no unit conversion, no in-flight computation).
        """
        return { 'timestamp'            : now,
                 'name'                 : self.name,
                 'unbound'              : self.unbound,
                 'mem_reclaim'          : self.mem_reclaim,
                 'started'              : self.stats[PWQ_STAT_STARTED],
                 'completed'            : self.stats[PWQ_STAT_COMPLETED],
                 'cpu_time'             : self.stats[PWQ_STAT_CPU_TIME],
                 'cpu_intensive'        : self.stats[PWQ_STAT_CPU_INTENSIVE],
                 'cm_wakeup'            : self.stats[PWQ_STAT_CM_WAKEUP],
                 'mayday'               : self.stats[PWQ_STAT_MAYDAY],
                 'rescued'              : self.stats[PWQ_STAT_RESCUED], }

    # Declared static so the header can be fetched from the class (as main()
    # does) or from an instance; without the decorator the instance call
    # raised TypeError because the function takes no 'self'.
    @staticmethod
    def table_header_str():
        """Return the column header line matching table_row_str()'s layout."""
        return f'{"":>24} {"total":>8} {"infl":>5} {"CPUtime":>8} '\
            f'{"CPUitsv":>7} {"CMwake":>7} {"mayday":>7} {"rescued":>7}'

    def table_row_str(self):
        """Return this workqueue's stats formatted as one table row.

        Columns that do not apply are shown as '-': CPUitsv/CMwake for unbound
        workqueues, mayday/rescued for workqueues without WQ_MEM_RECLAIM.
        """
        cpu_intensive = '-'
        cm_wakeup = '-'
        mayday = '-'
        rescued = '-'

        if not self.unbound:
            cpu_intensive = str(self.stats[PWQ_STAT_CPU_INTENSIVE])
            cm_wakeup = str(self.stats[PWQ_STAT_CM_WAKEUP])

        if self.mem_reclaim:
            mayday = str(self.stats[PWQ_STAT_MAYDAY])
            rescued = str(self.stats[PWQ_STAT_RESCUED])

        # In-flight = started - completed, clamped so it never prints negative.
        inflight = max(self.stats[PWQ_STAT_STARTED] - self.stats[PWQ_STAT_COMPLETED], 0)

        # Long names keep their last 24 characters. The CPU time counter is
        # divided by 1e6 for display (the help text describes the column as
        # seconds, so the raw value is presumably microseconds).
        # The row intentionally keeps its trailing space: the former
        # rstrip(':') could never match it and has been dropped as dead code.
        return f'{self.name[-24:]:24} ' \
               f'{self.stats[PWQ_STAT_STARTED]:8} ' \
               f'{inflight:5} ' \
               f'{self.stats[PWQ_STAT_CPU_TIME] / 1000000:8.1f} ' \
               f'{cpu_intensive:>7} ' \
               f'{cm_wakeup:>7} ' \
               f'{mayday:>7} ' \
               f'{rescued:>7} '
122
# Shutdown flag: flipped by the SIGINT handler below and polled by main() at
# the top of every monitoring pass.
exit_req = False

def sigint_handler(signr, frame):
    """Signal handler that asks the monitoring loop to stop.

    Both arguments are the standard signal-handler parameters and are unused;
    the handler only sets the module-level exit_req flag.
    """
    global exit_req
    exit_req = True
128
def main():
    """Print workqueue stats once or periodically until SIGINT.

    Reads the parsed command line from the module-level `args`:
      - args.workqueue: regexes ORed into one filter on workqueue names
        (no filtering when empty);
      - args.interval: seconds to sleep between passes, 0 to print one pass
        and return;
      - args.json: emit one JSON object per workqueue per pass instead of the
        human-readable table.
    """
    # handle args
    table_fmt = not args.json
    interval = args.interval

    # OR all name patterns together; an empty result means "match everything".
    re_str = '|'.join(args.workqueue)
    filter_re = re.compile(re_str) if re_str else None

    # monitoring loop
    signal.signal(signal.SIGINT, sigint_handler)

    while not exit_req:
        now = time.time()

        if table_fmt:
            print()
            print(WqStats.table_header_str())

        for wq in list_for_each_entry('struct workqueue_struct', workqueues.address_of_(), 'list'):
            stats = WqStats(wq)
            if filter_re and not filter_re.search(stats.name):
                continue
            if table_fmt:
                print(stats.table_row_str())
            else:
                # Serialize properly: printing the dict itself emits Python
                # repr (single quotes, True/False), which JSON consumers
                # cannot parse.
                print(json.dumps(stats.dict(now)))

        if interval == 0:
            break
        time.sleep(interval)

if __name__ == "__main__":
    main()
169