tracing: rework sched_preempt_disable trace point implementation

The current implementation of the sched_preempt_disable trace point
fails to detect the preemption disable time inside spin_lock_bh()
and spin_unlock_bh(). This is because __local_bh_disable_ip() calls
__preempt_count_add() directly, which skips the preemption disable
tracking. Instead of relying on updates to the preempt count, it
is better to do the preemption disable tracking directly in the
preemptoff tracer. This is similar to how irq disable tracking
is done.
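For illustration only, here is a simplified user-space sketch (not the kernel
code) of why the old approach misses the bh path. The helper names
raw_preempt_count_add() and tracked_preempt_count_add() are made up and stand
in for __preempt_count_add() and preempt_count_add() respectively:

#include <stdio.h>

static int preempt_count;

/* Raw counter bump, standing in for __preempt_count_add(): no tracking hook. */
static void raw_preempt_count_add(int val)
{
        preempt_count += val;
}

/* Tracked bump, standing in for preempt_count_add(): notices the first
 * preempt-off transition and would start the latency measurement there. */
static void tracked_preempt_count_add(int val)
{
        raw_preempt_count_add(val);
        if (preempt_count == val)
                printf("preempt-off tracking started (count=%d)\n", preempt_count);
}

int main(void)
{
        /* spin_lock()-style path: goes through the tracked helper. */
        tracked_preempt_count_add(1);
        preempt_count = 0;

        /* spin_lock_bh()-style path: the count is bumped directly, so the
         * tracking hook above never sees the preempt-off transition. */
        raw_preempt_count_add(1);
        printf("count=%d, but no tracking was started\n", preempt_count);

        return 0;
}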

The current code handles the false positives coming from __schedule()
by directly resetting the time stamp. This requires an interface
from the scheduler to the preemptoff tracer. To avoid this additional
interface, this patch detects the same condition by comparing
the task pid and context switch count. If they do not match between
preemption disable and enable, the preemption disable time is not
tracked, as a context switch happened in between.
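A minimal user-space model of this check is sketched below. The struct and
field names mirror the preempt_store added in trace_irqsoff.c later in this
patch; the helpers preempt_off()/preempt_on(), the pid 42 and the timestamps
are illustrative only:

#include <stdio.h>

/* Mirrors the per-CPU preempt_store added by this patch (illustrative only). */
struct preempt_store {
        unsigned long long ts;  /* timestamp taken at preempt-disable */
        int pid;                /* pid of the task that disabled preemption */
        unsigned long ncsw;     /* its context switch count at that time */
};

/* At preempt-disable: remember who disabled preemption, and when. */
static void preempt_off(struct preempt_store *ps, unsigned long long now,
                        int pid, unsigned long ncsw)
{
        ps->ts = now;
        ps->pid = pid;
        ps->ncsw = ncsw;
}

/* At preempt-enable: report a delta only if the same task, with the same
 * context switch count, is re-enabling preemption; otherwise a context
 * switch happened in between and the sample is dropped. */
static unsigned long long preempt_on(struct preempt_store *ps,
                                     unsigned long long now,
                                     int pid, unsigned long ncsw)
{
        unsigned long long delta = 0;

        if (ps->ts && ps->pid == pid && ps->ncsw == ncsw)
                delta = now - ps->ts;
        ps->ts = 0;
        return delta;
}

int main(void)
{
        struct preempt_store ps = { 0 };

        /* Same task, no context switch in between: delta is reported. */
        preempt_off(&ps, 100, 42, 7);
        printf("delta=%llu\n", preempt_on(&ps, 350, 42, 7));    /* 250 */

        /* Context switch in between (ncsw changed): sample is dropped. */
        preempt_off(&ps, 400, 42, 7);
        printf("delta=%llu\n", preempt_on(&ps, 900, 42, 8));    /* 0 */

        return 0;
}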

Due to this rework, the sched_preempt_disable trace point location is
changed to

/sys/kernel/debug/tracing/events/preemptirq/sched_preempt_disable/enable

Change-Id: I7f58d316b7c54bc7a54102bfeb678404bda010d4
Signed-off-by: Pavankumar Kondeti <pkondeti@codeaurora.org>
Pavankumar Kondeti 2020-01-13 10:46:41 +05:30
parent 5e3f09ce21
commit fd6298c891
6 changed files with 98 additions and 88 deletions


@@ -76,7 +76,7 @@ extern int sched_updown_migrate_handler(struct ctl_table *table,
                                size_t *lenp, loff_t *ppos);
#endif
#if defined(CONFIG_PREEMPT_TRACER) || defined(CONFIG_DEBUG_PREEMPT)
#if defined(CONFIG_PREEMPTIRQ_EVENTS) || defined(CONFIG_PREEMPT_TRACER)
extern unsigned int sysctl_preemptoff_tracing_threshold_ns;
#endif
#if defined(CONFIG_PREEMPTIRQ_EVENTS) && defined(CONFIG_IRQSOFF_TRACER)


@@ -90,6 +90,38 @@ TRACE_EVENT(irqs_disable,
                  __entry->caddr2, __entry->caddr3)
);

TRACE_EVENT(sched_preempt_disable,

        TP_PROTO(u64 delta, bool irqs_disabled,
                unsigned long caddr0, unsigned long caddr1,
                unsigned long caddr2, unsigned long caddr3),

        TP_ARGS(delta, irqs_disabled, caddr0, caddr1, caddr2, caddr3),

        TP_STRUCT__entry(
                __field(u64, delta)
                __field(bool, irqs_disabled)
                __field(void*, caddr0)
                __field(void*, caddr1)
                __field(void*, caddr2)
                __field(void*, caddr3)
        ),

        TP_fast_assign(
                __entry->delta = delta;
                __entry->irqs_disabled = irqs_disabled;
                __entry->caddr0 = (void *)caddr0;
                __entry->caddr1 = (void *)caddr1;
                __entry->caddr2 = (void *)caddr2;
                __entry->caddr3 = (void *)caddr3;
        ),

        TP_printk("delta=%llu(ns) irqs_d=%d Callers:(%ps<-%ps<-%ps<-%ps)",
                  __entry->delta, __entry->irqs_disabled,
                  __entry->caddr0, __entry->caddr1,
                  __entry->caddr2, __entry->caddr3)
);
#endif /* _TRACE_PREEMPTIRQ_H */
#include <trace/define_trace.h>


@@ -1649,38 +1649,6 @@ TRACE_EVENT(sched_isolate,
                  __entry->time, __entry->isolate)
);

TRACE_EVENT(sched_preempt_disable,

        TP_PROTO(u64 delta, bool irqs_disabled,
                unsigned long caddr0, unsigned long caddr1,
                unsigned long caddr2, unsigned long caddr3),

        TP_ARGS(delta, irqs_disabled, caddr0, caddr1, caddr2, caddr3),

        TP_STRUCT__entry(
                __field(u64, delta)
                __field(bool, irqs_disabled)
                __field(void*, caddr0)
                __field(void*, caddr1)
                __field(void*, caddr2)
                __field(void*, caddr3)
        ),

        TP_fast_assign(
                __entry->delta = delta;
                __entry->irqs_disabled = irqs_disabled;
                __entry->caddr0 = (void *)caddr0;
                __entry->caddr1 = (void *)caddr1;
                __entry->caddr2 = (void *)caddr2;
                __entry->caddr3 = (void *)caddr3;
        ),

        TP_printk("delta=%llu(ns) irqs_d=%d Callers:(%ps<-%ps<-%ps<-%ps)",
                  __entry->delta, __entry->irqs_disabled,
                  __entry->caddr0, __entry->caddr1,
                  __entry->caddr2, __entry->caddr3)
);
#include "walt.h"
#endif /* CONFIG_SMP */
#endif /* _TRACE_SCHED_H */


@@ -3353,55 +3353,17 @@ static inline void sched_tick_stop(int cpu) { }
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
        defined(CONFIG_TRACE_PREEMPT_TOGGLE))
/*
 * preemptoff stack tracing threshold in ns.
 * default: 1ms
 */
unsigned int sysctl_preemptoff_tracing_threshold_ns = 1000000UL;

struct preempt_store {
        u64 ts;
        unsigned long caddr[4];
        bool irqs_disabled;
};

DEFINE_PER_CPU(struct preempt_store, the_ps);

/*
 * This is only called from __schedule() upon context switch.
 *
 * schedule() calls __schedule() with preemption disabled.
 * if we had entered idle and exiting idle now, reset the preemption
 * tracking otherwise we may think preemption is disabled the whole time
 * when the non idle task re-enables the preemption in schedule().
 */
static inline void preempt_latency_reset(void)
{
        if (is_idle_task(this_rq()->curr))
                this_cpu_ptr(&the_ps)->ts = 0;
}

/*
 * If the value passed in is equal to the current preempt count
 * then we just disabled preemption. Start timing the latency.
 */
static inline void preempt_latency_start(int val)
{
        int cpu = raw_smp_processor_id();
        struct preempt_store *ps = &per_cpu(the_ps, cpu);

        if (preempt_count() == val) {
                unsigned long ip = get_lock_parent_ip();
#ifdef CONFIG_DEBUG_PREEMPT
                current->preempt_disable_ip = ip;
#endif
                ps->ts = sched_clock();
                ps->caddr[0] = CALLER_ADDR0;
                ps->caddr[1] = CALLER_ADDR1;
                ps->caddr[2] = CALLER_ADDR2;
                ps->caddr[3] = CALLER_ADDR3;
                ps->irqs_disabled = irqs_disabled();
                trace_preempt_off(CALLER_ADDR0, ip);
        }
}
@@ -3435,19 +3397,6 @@ NOKPROBE_SYMBOL(preempt_count_add);
static inline void preempt_latency_stop(int val)
{
        if (preempt_count() == val) {
                struct preempt_store *ps = &per_cpu(the_ps,
                                raw_smp_processor_id());
                u64 delta = ps->ts ? (sched_clock() - ps->ts) : 0;

                /*
                 * Trace preempt disable stack if preemption
                 * is disabled for more than the threshold.
                 */
                if (delta > sysctl_preemptoff_tracing_threshold_ns)
                        trace_sched_preempt_disable(delta, ps->irqs_disabled,
                                ps->caddr[0], ps->caddr[1],
                                ps->caddr[2], ps->caddr[3]);
                ps->ts = 0;
                trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
        }
}
@@ -3477,7 +3426,6 @@ NOKPROBE_SYMBOL(preempt_count_sub);
#else
static inline void preempt_latency_start(int val) { }
static inline void preempt_latency_stop(int val) { }
static inline void preempt_latency_reset(void) { }
#endif
static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
@@ -3701,7 +3649,6 @@ static void __sched notrace __schedule(bool preempt)
                if (!prev->on_rq)
                        prev->last_sleep_ts = wallclock;

                preempt_latency_reset();
                update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
                update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
                rq->nr_switches++;


@@ -336,7 +336,7 @@ static struct ctl_table kern_table[] = {
                .mode = 0644,
                .proc_handler = proc_dointvec,
        },
#if defined(CONFIG_PREEMPT_TRACER) || defined(CONFIG_DEBUG_PREEMPT)
#if defined(CONFIG_PREEMPT_TRACER) && defined(CONFIG_PREEMPTIRQ_EVENTS)
        {
                .procname = "preemptoff_tracing_threshold_ns",
                .data = &sysctl_preemptoff_tracing_threshold_ns,


@@ -620,7 +620,7 @@ struct irqsoff_store {
        unsigned long caddr[4];
};

DEFINE_PER_CPU(struct irqsoff_store, the_irqsoff);
static DEFINE_PER_CPU(struct irqsoff_store, the_irqsoff);

#endif /* CONFIG_PREEMPTIRQ_EVENTS */
/*
@@ -703,9 +703,57 @@ static struct tracer irqsoff_tracer __read_mostly =
#endif /* CONFIG_IRQSOFF_TRACER */

#ifdef CONFIG_PREEMPT_TRACER
#ifdef CONFIG_PREEMPTIRQ_EVENTS
/*
 * preemptoff stack tracing threshold in ns.
 * default: 1ms
 */
unsigned int sysctl_preemptoff_tracing_threshold_ns = 1000000UL;

struct preempt_store {
        u64 ts;
        unsigned long caddr[4];
        bool irqs_disabled;
        int pid;
        unsigned long ncsw;
};

static DEFINE_PER_CPU(struct preempt_store, the_ps);
#endif /* CONFIG_PREEMPTIRQ_EVENTS */

void tracer_preempt_on(unsigned long a0, unsigned long a1)
{
        int pc = preempt_count();
#ifdef CONFIG_PREEMPTIRQ_EVENTS
        struct preempt_store *ps;
        u64 delta = 0;

        lockdep_off();
        ps = &per_cpu(the_ps, raw_smp_processor_id());

        /*
         * schedule() calls __schedule() with preemption disabled.
         * if we had entered idle and exiting idle now, we think
         * preemption is disabled the whole time. Detect this by
         * checking if the preemption is disabled across the same
         * task. There is a possibility that the same task is scheduled
         * after idle. To rule out this possibility, compare the
         * context switch count also.
         */
        if (ps->ts && ps->pid == current->pid && (ps->ncsw ==
                        current->nvcsw + current->nivcsw))
                delta = sched_clock() - ps->ts;

        /*
         * Trace preempt disable stack if preemption
         * is disabled for more than the threshold.
         */
        if (delta > sysctl_preemptoff_tracing_threshold_ns)
                trace_sched_preempt_disable(delta, ps->irqs_disabled,
                        ps->caddr[0], ps->caddr[1],
                        ps->caddr[2], ps->caddr[3]);
        ps->ts = 0;
        lockdep_on();
#endif /* CONFIG_PREEMPTIRQ_EVENTS */

        if (preempt_trace(pc) && !irq_trace())
                stop_critical_timing(a0, a1, pc);
@@ -714,6 +762,21 @@ void tracer_preempt_on(unsigned long a0, unsigned long a1)
void tracer_preempt_off(unsigned long a0, unsigned long a1)
{
        int pc = preempt_count();
#ifdef CONFIG_PREEMPTIRQ_EVENTS
        struct preempt_store *ps;

        lockdep_off();
        ps = &per_cpu(the_ps, raw_smp_processor_id());
        ps->ts = sched_clock();
        ps->caddr[0] = CALLER_ADDR0;
        ps->caddr[1] = CALLER_ADDR1;
        ps->caddr[2] = CALLER_ADDR2;
        ps->caddr[3] = CALLER_ADDR3;
        ps->irqs_disabled = irqs_disabled();
        ps->pid = current->pid;
        ps->ncsw = current->nvcsw + current->nivcsw;
        lockdep_on();
#endif /* CONFIG_PREEMPTIRQ_EVENTS */

        if (preempt_trace(pc) && !irq_trace())
                start_critical_timing(a0, a1, pc);