Skip to content

Commit fb8e194

Browse files
committed
Add GPU work period support for XE driver
Signed-off-by: Aakash Sarkar <aakash.deep.sarkar@intel.com>
1 parent afd36ec commit fb8e194

11 files changed

Lines changed: 502 additions & 0 deletions

drivers/gpu/drm/xe/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ xe-y += xe_bb.o \
3838
xe_exec_queue.o \
3939
xe_force_wake.o \
4040
xe_ggtt.o \
41+
xe_gpu_work.o \
4142
xe_gpu_scheduler.o \
4243
xe_gsc.o \
4344
xe_gsc_debugfs.o \

drivers/gpu/drm/xe/xe_exec_queue.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,12 @@ struct xe_exec_queue *xe_exec_queue_create(struct xe_device *xe, struct xe_vm *v
159159
if (err)
160160
goto err_post_alloc;
161161

162+
/* GPU work period record */
163+
q->record.last_ts = 0;
164+
atomic64_set(&q->record.start_time_ns, 0);
165+
INIT_LIST_HEAD(&q->record.ws_link);
166+
spin_lock_init(&q->record.lock);
167+
162168
return q;
163169

164170
err_post_alloc:

drivers/gpu/drm/xe/xe_exec_queue_types.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,20 @@ struct xe_exec_queue {
128128
struct list_head link;
129129
} lr;
130130

131+
/** @record: gpu work record for this queue */
132+
struct work_record {
133+
/* start time can be accessed b/w irq and thread
134+
* contexts. So keep it atomic
135+
*/
136+
atomic64_t start_time_ns;
137+
/* timestamp */
138+
u32 last_ts;
139+
/* link to jobs list */
140+
struct list_head ws_link;
141+
/* lock protecting this record */
142+
spinlock_t lock;
143+
} record;
144+
131145
/** @ops: submission backend exec queue operations */
132146
const struct xe_exec_queue_ops *ops;
133147

drivers/gpu/drm/xe/xe_gpu_work.c

Lines changed: 346 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,346 @@
1+
#include "xe_gpu_work.h"
2+
#include <linux/pid.h>
3+
#include <linux/errno.h>
4+
#include <linux/jiffies.h>
5+
6+
#include "xe_exec_queue.h"
7+
#include "xe_hw_engine.h"
8+
#include "xe_lrc.h"
9+
#include "xe_device.h"
10+
#include "xe_gt.h"
11+
#include "xe_gt_clock.h"
12+
13+
#define CREATE_TRACE_POINTS
14+
#include "xe_power_gpu_work_period_trace.h"
15+
16+
/*
 * Return the uid currently stored in slot @key of the @stats array.
 *
 * @stats must point at the first element of the array; @key is used as the
 * index. The value is loaded with READ_ONCE() and no lock is taken here.
 */
static inline u32 get_stats_uid(s32 key, struct xe_work_stats *stats)
{
	return READ_ONCE(stats[key].uid);
}
21+
22+
static int get_uid_queue(struct xe_exec_queue *q)
23+
{
24+
struct xe_file *xef = NULL;
25+
struct pid *pid = NULL;
26+
struct task_struct *task = NULL;
27+
const struct cred *cred = NULL;
28+
int ret;
29+
30+
if (!q->vm || !q->vm->xef)
31+
return -EINVAL;
32+
33+
xef = xe_file_get(q->vm->xef);
34+
if (!xef) {
35+
ret = -EINVAL;
36+
goto out;
37+
}
38+
39+
pid = find_get_pid(xef->pid);
40+
if (!pid) {
41+
ret = -ESRCH;
42+
goto put_xef;
43+
}
44+
45+
task = get_pid_task(pid, PIDTYPE_PID);
46+
if (!task) {
47+
ret = -EINVAL;
48+
goto put_pid;
49+
}
50+
51+
cred = get_task_cred(task);
52+
if (!cred) {
53+
ret = -EINVAL;
54+
goto put_task;
55+
}
56+
57+
const unsigned int uid = cred->euid.val;
58+
ret = (int)uid;
59+
60+
put_cred(cred);
61+
put_task:
62+
put_task_struct(task);
63+
put_pid:
64+
put_pid(pid);
65+
put_xef:
66+
xe_file_put(xef);
67+
out:
68+
return ret;
69+
}
70+
71+
static void __emit_work_period_event(struct xe_work_stats *stat, bool discard)
72+
{
73+
struct xe_exec_queue *q = NULL, *q2 = NULL;
74+
75+
BUG_ON(!stat->uid);
76+
77+
lockdep_assert_held(&stat->lock);
78+
79+
if (STAT_INVALID(stat))
80+
discard = true;
81+
82+
if (!discard) {
83+
trace_gpu_work_period(stat->gpu_id, stat->uid,
84+
stat->start_time_ns, stat->end_time_ns,
85+
stat->active_duration_ns);
86+
}
87+
88+
/* clean up the slate */
89+
/* We keep the uid and the end time intact since we
90+
may encounter the same uid again soon */
91+
stat->start_time_ns = 0;
92+
stat->active_duration_ns = 0;
93+
stat->jiffies = 0;
94+
95+
/* Remove all the contexts associated with this uid and drop their
96+
* reference
97+
*/
98+
list_for_each_entry_safe(q, q2, &stat->queues, record.ws_link) {
99+
list_del_init(&q->record.ws_link);
100+
xe_exec_queue_put(q);
101+
}
102+
smp_mb();
103+
}
104+
105+
static void emit_work_period_event(struct xe_work_stats *stat)
106+
{
107+
lockdep_assert_held(&stat->lock);
108+
109+
u64 start_time = stat->start_time_ns;
110+
u64 end_time = stat->end_time_ns;
111+
112+
/* Google requirement restricts the interval between end time
113+
* and start time to be at most 1 second
114+
*/
115+
bool discard = ((end_time - start_time) >
116+
GPU_WORK_TIME_GAP_LIMIT_NS);
117+
118+
__emit_work_period_event(stat, discard);
119+
}
120+
121+
static void emit_event_and_evict_slot(struct xe_work_stats *stat)
122+
{
123+
lockdep_assert_held(&stat->lock);
124+
125+
u64 start_time = stat->start_time_ns;
126+
u64 end_time = stat->end_time_ns;
127+
128+
/* Google requirement restricts the interval between end time
129+
* and start time to be at most 1 second
130+
*/
131+
bool discard = ((end_time - start_time) >
132+
GPU_WORK_TIME_GAP_LIMIT_NS);
133+
stat->uid = 0;
134+
stat->end_time_ns = 0;
135+
__emit_work_period_event(stat, discard);
136+
}
137+
138+
static inline u32 get_cur_dt(struct xe_exec_queue* q)
139+
{
140+
struct xe_lrc *lrc;
141+
142+
spin_lock(&q->record.lock);
143+
lrc = q->lrc[0];
144+
u32 ts = xe_lrc_ctx_timestamp(lrc);
145+
s32 dt = ts - q->record.last_ts;
146+
q->record.last_ts = ts;
147+
spin_unlock(&q->record.lock);
148+
149+
if (unlikely(dt < 0))
150+
dt = 0;
151+
return dt * q->width;
152+
}
153+
154+
static u64 get_active_duration_ns(struct xe_exec_queue* q)
155+
{
156+
struct xe_gt *gt = q->gt;
157+
u64 dur = get_cur_dt(q);
158+
return dur? xe_gt_clock_interval_to_ns(gt, dur) : dur;
159+
}
160+
161+
static int handle_collision(s32 key, struct xe_engine_work *ew,
162+
u32 uid)
163+
{
164+
struct xe_work_stats * const stats = &ew->stats[0];
165+
u32 count = 0;
166+
167+
BUG_ON(KEY_INVALID(key));
168+
169+
while (get_stats_uid(key, stats) != uid) {
170+
if (unlikely(count >= XE_ENGINE_WORK_STATS_COUNT)) {
171+
return -ENOENT;
172+
}
173+
key++;
174+
if (key == XE_ENGINE_WORK_STATS_COUNT)
175+
key = 0;
176+
count++;
177+
}
178+
return key;
179+
}
180+
181+
static int find_next_available_slot(int key, struct xe_engine_work *ew)
182+
{
183+
return handle_collision(key, ew, 0);
184+
}
185+
186+
void xe_gpu_work_process_queue(struct xe_exec_queue *q,
187+
struct xe_engine_work *ew)
188+
{
189+
struct xe_work_stats *stat = NULL;
190+
s32 key = 0, uid = 0, cur_uid = 0;
191+
192+
if (!ew->enabled)
193+
return;
194+
195+
uid = get_uid_queue(q);
196+
if (uid < 0)
197+
return;
198+
199+
key = HASH_MAP(uid);
200+
cur_uid = get_stats_uid(key, ew->stats);
201+
202+
if (unlikely(cur_uid && cur_uid != uid)) {
203+
/*
204+
* We have encountered a hash collision.
205+
* First check if the uid is already present in another
206+
* slot by doing a linear search
207+
*/
208+
key = handle_collision(key, ew, uid);
209+
/*
210+
* We couldn't find the uid in the stats array
211+
* this means this is the first occurence of this
212+
* uid. So we find the next available slot
213+
*/
214+
if (KEY_INVALID(key))
215+
key = find_next_available_slot(key, ew);
216+
217+
/*
218+
* This can only happen if all the slots in our stats
219+
* array are occupied. Emit the event and evict one slot.
220+
*/
221+
if (KEY_INVALID(key)) {
222+
u32 idx = HASH_MAP(uid);
223+
stat = &ew->stats[idx];
224+
spin_lock(&stat->lock);
225+
emit_event_and_evict_slot(stat);
226+
spin_unlock(&stat->lock);
227+
key = idx;
228+
}
229+
}
230+
stat = &ew->stats[key];
231+
BUG_ON(stat->uid && (stat->uid != uid));
232+
u64 job_start_time =
233+
atomic64_read(&q->record.start_time_ns);
234+
235+
/*
236+
* If the uid at our hash index is empty (zero)
237+
* this implies that our ctx is processed for
238+
* the first time.
239+
*
240+
* So, we set the start time to the last time this
241+
* ctx was put into the active queue after emitting
242+
* its event. We also set the total active duration to
243+
* the current runtime of this ctx
244+
*/
245+
spin_lock(&stat->lock);
246+
if (!stat->uid) {
247+
stat->uid = uid;
248+
stat->start_time_ns = job_start_time;
249+
stat->active_duration_ns =
250+
get_active_duration_ns(q);
251+
stat->end_time_ns = ktime_get_raw_ns();
252+
253+
atomic_inc(&ew->num_entries);
254+
goto list_add;
255+
}
256+
257+
/* Google requirement prohibits next start time to
258+
* overlap with previous end time for a given uid.
259+
* Skip the reuqests that don't match the requirement
260+
* until we get the desired new start time
261+
*/
262+
u64 prev_start_time = stat->start_time_ns;
263+
u64 prev_end_time = stat->end_time_ns;
264+
if (!prev_start_time && job_start_time <= prev_end_time)
265+
goto out;
266+
267+
/*
268+
* We set the endtime to the current time this job
269+
* is being processed and accumulate the current
270+
* runtime to the total active duration
271+
*/
272+
stat->start_time_ns = prev_start_time?: job_start_time;
273+
stat->end_time_ns = ktime_get_raw_ns();
274+
stat->active_duration_ns +=
275+
get_active_duration_ns(q);
276+
277+
/* We limit the frequency of events to 10ms */
278+
unsigned long delta = jiffies - stat->jiffies;
279+
if (jiffies_to_msecs(delta) >=
280+
GPU_WORK_PERIOD_EVENT_TIMEOUT_MS)
281+
{
282+
emit_work_period_event(stat);
283+
stat->jiffies = jiffies;
284+
goto out;
285+
}
286+
287+
list_add:
288+
if (list_empty(&q->record.ws_link)) {
289+
/* This implies the queue wasn't being tracked
290+
* until this point. Get a reference and add this
291+
* to the list to mark it as being tracked.
292+
*/
293+
xe_exec_queue_get(q);
294+
list_add(&q->record.ws_link, &stat->queues);
295+
}
296+
out:
297+
spin_unlock(&stat->lock);
298+
}
299+
300+
void xe_gpu_work_stats_init(struct xe_hw_engine *engine)
301+
{
302+
struct xe_engine_work *ew = &engine->gpu_work;
303+
304+
atomic_set(&ew->num_entries, 0);
305+
306+
/* Initalize the slots */
307+
for (int i = 0; i < XE_ENGINE_WORK_STATS_COUNT; i++) {
308+
struct xe_work_stats *stat = &ew->stats[i];
309+
310+
stat->gpu_id = engine->class;
311+
stat->uid = 0;
312+
stat->start_time_ns = 0;
313+
stat->end_time_ns = 0;
314+
stat->active_duration_ns = 0;
315+
stat->jiffies = 0;
316+
317+
spin_lock_init(&stat->lock);
318+
INIT_LIST_HEAD(&stat->queues);
319+
}
320+
321+
/* Enable gpu work period */
322+
ew->enabled = true;
323+
}
324+
325+
void xe_gpu_work_stats_fini(struct xe_hw_engine *engine)
326+
{
327+
struct xe_engine_work *ew = &engine->gpu_work;
328+
329+
ew->enabled = false;
330+
if (!atomic_read(&ew->num_entries))
331+
return;
332+
333+
for (int i = 0; i < XE_ENGINE_WORK_STATS_COUNT; i++) {
334+
struct xe_work_stats *stat = &ew->stats[i];
335+
336+
if (!get_stats_uid(i, stat))
337+
continue;
338+
339+
spin_lock(&stat->lock);
340+
emit_work_period_event(stat);
341+
spin_unlock(&stat->lock);
342+
343+
if (atomic_dec_and_test(&ew->num_entries))
344+
break;
345+
}
346+
}

0 commit comments

Comments
 (0)