Hi,大家好,我是编程小6,很荣幸遇见你,我把这些年在开发过程中遇到的问题或想法写出来,今天说一说pr_emerg耗时,影响性能原理排查,希望能够帮助你!!!。
本文涉及到的性能debug方式如下:
“在调试 vdsp的dvfs,发现notify执行dvfs callback的时候callback中执行会很慢,两条log之间什么都不作,耗时都可能在8ms左右,固定cpu/ddr频率也不没效果。”
debug思路如下:
从camera owner提供的log来看,可以确定是pr_emerg答应log方式导致耗时较长,去掉这个log也验证了是其导致的。这一个类型log一般使用在kernel panic或者串口log中。按照之前的项目经验才确定是这种导致的性能下降,但是一直没有去深究为何?
在任意位置添加如下的log:
trace_printk("samarxie 11111\n");
pr_emerg("samarxie func=%s enter\n",__func__);
trace_printk("samarxie 22222\n");
pr_err("samarxie pr_err====\n");
trace_printk("samarxie 33333\n");
pr_emerg("samarxie func=%s exit\n",__func__);
trace_printk("samarxie 44444\n");
只要编译KO或者bootimage刷入手机看log输出即可。
正常情况是所有log出现的时间顺序应该间隔非常短,在us级别.但是从正常的log看,信息如下:
<0>[ 373.701444] c0 samarxie func=dbs_check_cpu_xxx enter
<3>[ 373.706543] c0 samarxie pr_err====
<0>[ 373.706565] c0 samarxie func=dbs_check_cpu_xxx exit
可以看到
pr_emerg("samarxie func=%s enter\n",__func__);
pr_err("samarxie pr_err====\n");
这两个log时间耗时很长.
从trace可以看到如下信息:
sh-8278 [000] ..s. 373.701427: dbs_check_cpu_xxx: samarxie 11111
sh-8278 [000] ..s. 373.706538: dbs_check_cpu_xxx: samarxie 22222
也是耗时这么长时间。
所以可以确定是pr_emerg的使用导致耗时过长,并且会影响性能.但是为何呢?pr_emerg是log level为0,而我们平台的log level=1就会往串口输出数据。
#define pr_emerg(fmt, ...) \ printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
#define KERN_EMERG KERN_SOH "0" /* system is unusable */
我继续check到底是哪里耗时这么长呢?开启lock trace debug feature,具体开启如下的config:
#
# Lock Debugging (spinlocks, mutexes, etc...)
#
-# CONFIG_DEBUG_RT_MUTEXES is not set
-# CONFIG_DEBUG_SPINLOCK is not set
-# CONFIG_DEBUG_MUTEXES is not set
-# CONFIG_DEBUG_WW_MUTEX_SLOWPATH is not set
-# CONFIG_DEBUG_LOCK_ALLOC is not set
-# CONFIG_PROVE_LOCKING is not set
-# CONFIG_LOCK_STAT is not set
+CONFIG_DEBUG_RT_MUTEXES=y
+CONFIG_DEBUG_SPINLOCK=y
+CONFIG_DEBUG_MUTEXES=y
+CONFIG_DEBUG_WW_MUTEX_SLOWPATH=y
+CONFIG_DEBUG_LOCK_ALLOC=y
+CONFIG_PROVE_LOCKING=y
+CONFIG_LOCKDEP=y
+CONFIG_LOCK_STAT=y
+CONFIG_DEBUG_LOCKDEP=y
开启上面的之后,我们就可以trace lock的情况了.
在sys/kernel/debug/tracing/event/lock/目录下存在如下的event:
/sys/kernel/debug/tracing/events/lock # ls -l
total 0
-rw-r--r-- 1 root root 0 1970-01-01 08:00 enable
-rw-r--r-- 1 root root 0 1970-01-01 08:00 filter
drwxr-xr-x 2 root root 0 1970-01-01 08:00 lock_acquire
drwxr-xr-x 2 root root 0 1970-01-01 08:00 lock_acquired
drwxr-xr-x 2 root root 0 1970-01-01 08:00 lock_contended
drwxr-xr-x 2 root root 0 1970-01-01 08:00 lock_release
之后使用如下的ftrace脚本,抓取复现场景的信息即可:
#!/bin/bash
adb root
adb remount
#low memory,buffer size should be little. otherwise lose trace event
adb shell "echo 60000 > /d/tracing/buffer_size_kb"
#adb shell "echo sched_switch sched_wakeup sched_waking > /d/tracing/set_event"
adb shell "echo lock> /d/tracing/set_event"
adb shell "echo > /d/tracing/trace"
adb shell "echo 0 > /d/tracing/tracing_on"
sleep 1
echo "start fetch trace informantion $1s"
adb shell "echo 1 > /d/tracing/tracing_on"
#set fetch trace timeout
echo "do anything"
sleep $1
adb shell "echo 0 > /d/tracing/tracing_on"
echo "fetch trace informantion end"
echo "save trace to /data/trace.txt file"
adb shell "cat /d/tracing/trace > /data/trace.txt"
echo "all fetch over!"
echo "start pull file in current dir"
time=`date '+%T'`
adb pull /data/trace.txt $time-trace.txt
echo "pull file name is $time-trace.txt"
就可以看到,是console_lock一直持锁导致的:
sh-8278 [000] ..s. 373.701427: dbs_check_cpu_xxx: samarxie 11111
<0>[ 373.701444] c0 samarxie func=dbs_check_cpu_xxx enter
sh-8278 [000] d.s1 373.701477: lock_acquire: 00000000707bef2b console_owner
sh-8278 [000] d.s1 373.706452: lock_release: 00000000707bef2b console_owner
sh-8278 [000] ..s. 373.706538: dbs_check_cpu_xxx: samarxie 22222
<3>[ 373.706543] c0 samarxie pr_err====
sh-8278 [000] ..s. 373.706560: dbs_check_cpu_xxx: samarxie 33333
<0>[ 373.706565] c0 samarxie func=dbs_check_cpu_xxx exit
sh-8278 [000] d.s1 373.706584: lock_acquire: 00000000707bef2b console_owner
sh-8278 [000] d.s1 373.711450: lock_release: 00000000707bef2b console_owner
sh-8278 [000] ..s. 373.711619: dbs_check_cpu_xxx: samarxie 44444
从上面可以知道:
继续定位为何持锁这么长时间:
static struct lockdep_map console_owner_dep_map = {
.name = "console_owner"
};
/** * console_lock_spinning_enable - mark beginning of code where another * thread might safely busy wait * * This basically converts console_lock into a spinlock. This marks * the section where the console_lock owner can not sleep, because * there may be a waiter spinning (like a spinlock). Also it must be * ready to hand over the lock at the end of the section. */
static void console_lock_spinning_enable(void)
{
raw_spin_lock(&console_owner_lock);
console_owner = current;
raw_spin_unlock(&console_owner_lock);
/* The waiter may spin on us after setting console_owner */
spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
}
/** * console_lock_spinning_disable_and_check - mark end of code where another * thread was able to busy wait and check if there is a waiter * * This is called at the end of the section where spinning is allowed. * It has two functions. First, it is a signal that it is no longer * safe to start busy waiting for the lock. Second, it checks if * there is a busy waiter and passes the lock rights to her. * * Important: Callers lose the lock if there was a busy waiter. * They must not touch items synchronized by console_lock * in this case. * * Return: 1 if the lock rights were passed, 0 otherwise. */
static int console_lock_spinning_disable_and_check(void)
{
int waiter;
raw_spin_lock(&console_owner_lock);
waiter = READ_ONCE(console_waiter);
console_owner = NULL;
raw_spin_unlock(&console_owner_lock);
if (!waiter) {
spin_release(&console_owner_dep_map, 1, _THIS_IP_);
return 0;
}
/* The waiter is now free to continue */
WRITE_ONCE(console_waiter, false);
spin_release(&console_owner_dep_map, 1, _THIS_IP_);
/* * Hand off console_lock to waiter. The waiter will perform * the up(). After this, the waiter is the console_lock owner. */
mutex_release(&console_lock_dep_map, 1, _THIS_IP_);
return 1;
}
//在函数console_unlock中调用如下:
/* * While actively printing out messages, if another printk() * were to occur on another CPU, it may wait for this one to * finish. This task can not be preempted if there is a * waiter waiting to take over. */
console_lock_spinning_enable();
stop_critical_timings(); /* don't trace print latency */
call_console_drivers(ext_text, ext_len, text, len);
start_critical_timings();
if (console_lock_spinning_disable_and_check()) {
printk_safe_exit_irqrestore(flags);
goto out;
}
继续定位耗时的操作为call_console_drivers函数. 那么到底耗时操作在哪里呢?定位函数耗时操作,可以抓取trace function graphic信息,脚本如下:
#!/bin/bash
adb root
adb remount
#low memory,buffer size should be little. otherwise lose trace event
adb shell "echo 60000 > /d/tracing/buffer_size_kb"
#adb shell "echo sched_switch sched_wakeup sched_waking > /d/tracing/set_event"
#adb shell "echo lock> /d/tracing/set_event"
adb shell "echo nop > /d/tracing/set_event"
adb shell "echo > /d/tracing/trace"
adb shell "echo function_graph > /d/tracing/current_tracer"
adb shell "echo console_unlock > /d/tracing/set_graph_function"
adb shell "echo 0 > /d/tracing/tracing_on"
sleep 1
echo "start fetch trace informantion $1s"
adb shell "echo 1 > /d/tracing/tracing_on"
#set fetch trace timeout
echo "do anything"
sleep $1
adb shell "echo 0 > /d/tracing/tracing_on"
echo "fetch trace informantion end"
echo "save trace to /data/trace.txt file"
adb shell "cat /d/tracing/trace > /data/trace.txt"
echo "all fetch over!"
echo "start pull file in current dir"
time=`date '+%T'`
adb pull /data/trace.txt func-graphic-$time-trace.txt
echo "pull file name is func-graphic-$time-trace.txt"
具体各个函数耗时如下(trace中获取的):
0) | console_unlock() {
0) | __printk_safe_enter() {
0) 1.192 us | preempt_count_add();
0) 0.384 us | preempt_count_sub();
0) + 12.616 us | }
0) | _raw_spin_lock() {
0) 0.384 us | preempt_count_add();
0) 0.423 us | do_raw_spin_trylock();
0) 9.000 us | }
0) | msg_print_text() {
0) + 11.000 us | print_prefix();
0) 4.962 us | print_prefix();
0) + 24.692 us | }
0) | _raw_spin_unlock() {
0) 0.462 us | do_raw_spin_unlock();
0) 0.385 us | preempt_count_sub();
0) 7.961 us | }
0) | console_lock_spinning_enable() {
//console_owner加锁
0) | _raw_spin_lock() {
0) 0.385 us | preempt_count_add();
0) 0.423 us | do_raw_spin_trylock();
0) 8.154 us | }
0) | _raw_spin_unlock() {
0) 0.462 us | do_raw_spin_unlock();
0) 0.385 us | preempt_count_sub();
0) 7.885 us | }
0) + 25.077 us | }
0) | /* samarxie call_console_drivers start */
0) | xxx_console_write() {
0) | _raw_spin_lock_irqsave() {
0) 0.384 us | preempt_count_add();
0) 2.038 us | do_raw_spin_trylock();
0) 9.654 us | }
0) | uart_console_write() {
0) | xxx_console_putchar() {
0) 1.500 us | __delay();
0) 6.808 us | }
0) | xxx_console_putchar() {
0) 1.462 us | __delay();
0) 5.461 us | }
0) | xxx_console_putchar() {
//耗时函数
0) 1.500 us | __delay();
0) 1.423 us | __delay();
0) 1.500 us | __delay();
0) 1.423 us | __delay();
0) 1.462 us | __delay();
0) 1.423 us | __delay();
0) 1.423 us | __delay();
0) 1.423 us | __delay();
0) 1.500 us | __delay();
0) 1.423 us | __delay();
0) 1.461 us | __delay();
0) 1.423 us | __delay();
0) 1.500 us | __delay();
0) 1.423 us | __delay();
0) 1.500 us | __delay();
0) 1.500 us | __delay();
0) + 80.808 us | }
0) | xxx_console_putchar() {
0) 1.500 us | __delay();
0) 1.500 us | __delay();
0) 1.423 us | __delay();
0) 1.423 us | __delay();
0) 1.423 us | __delay();
0) 1.500 us | __delay();
0) 1.500 us | __delay();
0) 1.462 us | __delay();
0) 1.423 us | __delay();
0) 1.500 us | __delay();
0) 1.423 us | __delay();
0) 1.423 us | __delay();
0) 1.462 us | __delay();
0) 1.423 us | __delay();
0) 1.461 us | __delay();
0) + 86.731 us | }
……………………………………………….
0) 1.500 us | __delay();
0) 1.423 us | __delay();
0) | _raw_spin_unlock_irqrestore() {
0) 0.462 us | do_raw_spin_unlock();
0) 0.385 us | preempt_count_sub();
0) 7.961 us | }
0) # 4977.385 us | }
0) | pstore_console_write() {
0) | __getnstimeofday64() {
0) 0.500 us | arch_counter_read();
0) 6.231 us | }
0) | buffer_size_add() {
0) | _raw_spin_lock_irqsave() {
0) 0.423 us | preempt_count_add();
0) 0.462 us | do_raw_spin_trylock();
0) 8.961 us | }
0) | _raw_spin_unlock_irqrestore() {
0) 0.462 us | do_raw_spin_unlock();
0) 0.385 us | preempt_count_sub();
0) 8.346 us | }
0) + 26.885 us | }
0) | buffer_start_add() {
0) | _raw_spin_lock_irqsave() {
0) 0.423 us | preempt_count_add();
0) 0.423 us | do_raw_spin_trylock();
0) 9.154 us | }
0) | _raw_spin_unlock_irqrestore() {
0) 0.461 us | do_raw_spin_unlock();
0) 0.385 us | preempt_count_sub();
0) 7.923 us | }
0) + 25.462 us | }
0) 1.385 us | __memcpy_toio();
0) + 88.423 us | }
0) | /* samarxie call_console_drivers end */
0) | console_lock_spinning_disable_and_check() {
//console_owner解锁
可以看到是xxx_console_putchar耗时过长.
看起代码:
static void wait_for_xmitr(struct uart_port *port)
{
unsigned int status, tmout = 10000;
/* wait up to 10ms for the character(s) to be sent */
do {
status = serial_in(port, XXX_STS1);
if (--tmout == 0)
break;
udelay(1); //trace中看到的udelay执行
} while (status & xxx_TX_FIFO_CNT_MASK);
}
static void xxx_console_putchar(struct uart_port *port, int ch)
{
wait_for_xmitr(port);
serial_out(port, xxx_TXD, ch);
}
就算做实验把udelay去掉,耗时也没有减少,可以看到串口读写操作耗时多。
本身耗时在硬件的行为。
所以概述的措施可能有:
最后,log的正确合理的使用对性能的影响还是蛮关键的。
今天的分享到此就结束了,感谢您的阅读,如果确实帮到您,您可以动动手指转发给其他人。
上一篇
已是最后文章
下一篇
已是最新文章