20150501 调试分析之 修改内核来定位系统僵死问题
2015-05-01 Lover雪儿
今天还是研究内核调试,
死机,这个词语,大家应该不陌生.
当我们写程序,如果加入到内核中的程序中有出现死循环的话,启动内核运行程序会直接进入相对死机状态.
那么怎么可以解决这个问题呢?
我们都知道,我们人的心脏是一直跳动的,而恰恰如此,内核也有它的跳动,那就是tick中断,
所以我们可以从tick中断入手,解决上面的死机问题.
在开发板上运行cat /proc/interrupts 可以查看系统当前的各种中断号,
可以看到一个中断名为i.MX Timer Tick 的中断,那么它就是我们今天的主角.
1 root@EasyARM-iMX257 /mnt/nfs/module# cat /proc/interrupts 2 CPU0 3 9: 0 - mxsdhci 4 14: 0 - CSPI_IRQ 5 25: 2 - imxdi - mxcsdma 6 35: 0 - ehci_hcd:usb1 7 37 2453 - mxcintuart 8 46: 3 - m - i.MX Timer Tick 9 57: 0 - mxsdhci10 Err: 0
在内核中查找 Timer Tick的源代码,如下所示:
/* linux-2.6.31/arch/arm/plat-mxc/time.c
 * IRQ handler for the timer
 *
 * Runs on every timer-tick interrupt: reads the timer status register,
 * acknowledges the interrupt and forwards the tick to the clock-event
 * handler.  Always returns IRQ_HANDLED.
 */
static irqreturn_t mxc_timer_interrupt(int irq, void *dev_id)
{
	struct clock_event_device *evt = &clockevent_mxc;
	uint32_t tstat;

	/* The status register offset depends on the timer version. */
	if (timer_is_v2())
		tstat = __raw_readl(timer_base + MX3_TSTAT);
	else
		tstat = __raw_readl(timer_base + MX1_2_TSTAT);

	gpt_irq_acknowledge();

	/* Hand the tick over to whatever handler is installed on the clock event device. */
	evt->event_handler(evt);

	return IRQ_HANDLED;
}

/*
 * IRQ descriptor for the tick; .name is the string that appears in
 * /proc/interrupts ("i.MX Timer Tick").
 */
static struct irqaction mxc_timer_irq = {
	.name = "i.MX Timer Tick",
	.flags = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL,
	.handler = mxc_timer_interrupt,
};
在这个函数中,我们可以增加一些代码:有点类似看门狗
一.在 mxc_timer_interrupt中增加打印语句
在mxc_timer_interrupt 中断函数中检测系统当前正在运行的进程,如果10s之内都是同一个进程正在运行的话,那我们就把这个进程打印出来(先从简单入手,此处先不做太多的复杂事情)
步骤:
①首先备份 linux-2.6.31/arch/arm/plat-mxc/time.c,
②接着修改time.c的内容,
③最后编译内核,重新给板子启动新内核
root@Lover雪:/home/study/nfs_home/system/linux-2.6.31/arch/arm/plat-mxc# cp time.c time.c.bak 修改time.c,再中断函数中加入打印语句 root@Lover雪:/home/study/nfs_home/system/linux-2.6.31/arch/arm/plat-mxc# vi time.c ************************************************************************************************ root@Lover雪:/home/study/nfs_home/system/linux-2.6.31/arch/arm/plat-mxc# cd ../../.. 编译内核 root@Lover雪:/home/study/nfs_home/system/linux-2.6.31# make uImage CHK include/linux/version.h make[1]: 'include/asm-arm/mach-types.h' is up to date. CHK include/linux/utsrelease.h SYMLINK include/asm -> include/asm-arm ************************************************************************************************ Data Size: 2180620 Bytes = 2129.51 kB = 2.08 MB Load Address: 80008000 Entry Point: 80008000 Image arch/arm/boot/uImage is ready root@Lover雪:/home/study/nfs_home/system/linux-2.6.31# cp arch/arm/boot/uImage /tftpboot/uImage root@Lover雪:/home/study/nfs_home/system/linux-2.6.31# ************************************************************************************************ 在开发板上重新烧写内核 MX25 U-Boot > run upsystem FEC: enable RMII gasket ver 192.168.31.179; our IP address is 192.168.31.180 Filename '00 Loading: ################################################################# ################################################################# ################### done ************************************************************************************************ 加载完毕后,如果不动开发板,会发现,每隔10s钟,就会有进程pid=0,名字name=swapper的打印消息. root@EasyARM-iMX257 ~# mxc_timer_interrupt: pid = 0, name = swapper root@EasyARM-iMX257 ~# mxc_timer_interrupt: pid = 0, name = swapper root@EasyARM-iMX257 ~# |
修改time.c如下所示:
1 /* linux-2.6.31/arch/arm/plat-mxc/time.c 2 * IRQ handler for the timer 3 */ 4 static irqreturn_t mxc_timer_interrupt(int irq, void *dev_id) 5 { 6 struct clock_event_device *evt = &clockevent_mxc; 7 uint32_t tstat; 8 //// 9 static pid_t pre_pid;10 static int cnt = 0;11 if(pre_pid == current->pid){12 cnt++;13 }else{14 cnt = 0;15 pre_pid = current->pid;16 }17 if(cnt == 10*HZ){18 cnt = 0;19 printk("mxc_timer_interrupt: pid = %d, name = %s\n",current->pid, current->comm);20 }21 //22 if (timer_is_v2())23 tstat = __raw_readl(timer_base + MX3_TSTAT);24 else25 tstat = __raw_readl(timer_base + MX1_2_TSTAT);26 27 gpt_irq_acknowledge();28 29 evt->event_handler(evt);30 31 return IRQ_HANDLED;32 }
二.修改错误代码,在代码中增加死循环
还是沿用我们前面的err_led.c的驱动程序.
参考博客地址:
在open函数中,我们故意加入一个死循环.
/* err_led.c */
/*
 * open() handler of the demo driver.  It logs a message and then spins
 * forever on purpose, so that opening the device hangs the caller inside
 * the kernel; the return statement is never reached.
 */
static int key_open(struct inode *inode, struct file *file)
{
	printk("<0>function open!\n\n");
	/* Deliberate infinite loop inserted here */
	while(1);
	return 0;
}
编译接着在开发板中加载错误驱动程序,使用cat 命令打开设备.
root@EasyARM-iMX257 ~# ifconfig eth0 192.168.31.181;mount -t nfs 192.168.31.179: /home/study/nfs_home /mnt/nfs -o nolock;cd /mnt/nfs/module/ root@EasyARM-iMX257 /mnt/nfs/module# root@EasyARM-iMX257 /mnt/nfs/module# cd 39_debug_with_timer/ root@EasyARM-iMX257 /mnt/nfs/module/39_debug_with_timer# insmod err_led.ko Hello,this is err_led_dev module! addr base_iomux : c4a26000 addr base_gpio3 : c4a2a000 addCTL : c4a26270 addr GDIR_GPIO3a2a000 root@EasyARM-iMX257 /mnt/nfs/module/39_debug_with_timer# cat /dev/err_led_dev function open! ################################################################# 可以发现,打开设备后,进入open函数,系统直接进入死机状态,每格10s中便会打印出我们的进程号pid=1805 mxc_timer_interrupt: pid = 1805, name = cat mxc_timer_interrupt: pid = 1805, name = cat mxc_timer_interrupt: pid = 1805, name = cat |
三.在asm_do_IRQ中增加打印语句,打印出PC值
接着恢复上面的time.c的代码,我们在linux-2.6.31/arch/arm/kernel/irq.c文件中找到系统中断总调用者asm_do_IRQ,
我们在asm_do_IRQ函数里加入前面time.c中的打印代码.
root@Lover雪:/home/study/nfs_home/system/linux-2.6.31# cd arch/arm/plat-mxc/ root@Lover雪:/home/study/nfs_home/system/linux-2.6.31/arch/arm/plat-mxc# mv time.c.bak time.c root@Lover雪:/home/study/nfs_home/system/linux-2.6.31/arch/arm/plat-mxc# cd .. root@Lover雪:/home/study/nfs_home/system/linux-2.6.31/arch/arm# cd kernel/ root@Lover雪:/home/study/nfs_home/system/linux-2.6.31/arch/arm/kernel# vi irq.c root@Lover雪:/home/study/nfs_home/system/linux-2.6.31/arch/arm/kernel# cd ../../../ root@Lover雪:/home/study/nfs_home/system/linux-2.6.31# make uImage ######################################################## Load Address: 80008000 Entry Point: 80008000 Image arch/arm/boot/uImage is ready root@Lover雪:/home/study/nfs_home/system/linux-2.6.31# cp arch/arm/boot/uImage /tftpboot/uImage root@Lover雪:/home/study/nfs_home/system/linux-2.6.31# ######################################################## 从开发板重新烧写新内核 启动开发板 |
Irq.c修改内容如下:
1 /* linux-2.6.31/arch/arm/kernel/irq.c 2 * do_IRQ handles all hardware IRQ's. Decoded IRQs should not 3 * come via this function. Instead, they should provide their 4 * own 'handler' 5 */ 6 asmlinkage void __exception asm_do_IRQ(unsigned int irq, struct pt_regs *regs) 7 { 8 struct pt_regs *old_regs = set_irq_regs(regs); 9 ////10 //从 cat /proc/interrupts 中得到我们的tick中断为4611 if(irq == 46)12 {13 ////14 static pid_t pre_pid;15 static int cnt = 0;16 if(pre_pid == current->pid){17 cnt++;18 }else{19 cnt = 0;20 pre_pid = current->pid;21 }22 if(cnt == 10*HZ){23 cnt = 0;24 printk("asm_do_IRQ => mxc_timer_interrupt: pid = %d, name = %s\n",current->pid, current->comm);25 printk("pc = %08x\n",regs->ARM_pc);//ptract.h26 }27 /////28 }29 ////30 31 irq_enter();32 33 /*34 * Some hardware gives randomly wrong interrupts. Rather35 * than crashing, do something sensible.36 */37 if (unlikely(irq >= NR_IRQS)) {38 if (printk_ratelimit())39 printk(KERN_WARNING "Bad IRQ%u\n", irq);40 ack_bad_irq(irq);41 } else {42 generic_handle_irq(irq);43 }44 45 /* AT91 specific workaround */46 irq_finish(irq);47 48 irq_exit();49 set_irq_regs(old_regs);50 }
启动开发板,加载错误的驱动程序,根据打印出来的PC值来反推错误地址:
root@EasyARM-iMX257 /mnt/nfs/module/39_debug_with_timer# insmod err_led.ko root@EasyARM-iMX257 /mnt/nfs/module/39_debug_with_timer# cat /dev/err_led_dev function open! |
根据打印出来的pc值,再 cat /proc/kallsyms,找到错误的驱动err_led.ko,对其进行反汇编,然后找到错误的函数,进而反推出c语言代码出错位置.
92 00000130 <key_open>: 93 130: e52de004 str lr, [sp, #-4]! 94 134: e59f0008 ldr r0, [pc, #8] ; 144 <.text+0x144> 95 138: e24dd004 sub sp, sp, #4 ; 0x4 96 13c: ebfffffe bl 0 <printk> 97 140: eafffffe b 140 <key_open+0x10> //很容易就找到了错误地址,此处一直b 140就为死循环 98 144: 000000cc andeq r0, r0, ip, asr #1 99 |
步骤和前面的博客文章
<>一样了,
此处不再赘述,博客地址:
如果要调试应用程序,可以使用strace,具体的用法,百度上有很详细的解释
附上驱动程序err_led.c
1 #include2 #include 3 #include 4 #include 5 #include 6 #include 7 #include 8 #include 9 #include 10 #include 11 #include 12 #include 13 #include 14 15 #define Driver_NAME "err_led_dev" 16 #define DEVICE_NAME "err_led_dev" 17 18 static int major = 0; 19 20 #define LED_ON 0 21 #define LED_OFF 1 22 23 24 //auto to create device node 25 static struct class *drv_class = NULL; 26 static struct class_device *drv_class_dev = NULL; 27 28 //寄存器基址; 29 static unsigned long mem_iomux; 30 static unsigned long mem_gpio3; 31 static unsigned long base_iomux; //iomux基址 0X 43FA C000 - 0X 43FA FFFF 32 static unsigned long base_gpio3; //gpio3 0X 53FA 4000 - 0X 53FA 7FFF 33 // MUX_CTL模式选择 配置寄存器 34 #define MUX_CTL (*(volatile unsigned long *)(base_iomux + 0x0060)) 35 // PAD_CTL GPIO常用功能设置 36 #define PAD_CTL (*(volatile unsigned long *)(base_iomux + 0x0270)) 37 // GPIO DR 数据寄存器 DR 38 #define DR_GPIO3 (*(volatile unsigned long *)(base_gpio3 + 0x0000)) 39 // GPIO GDIR 方向控制寄存器 GDIR 40 #define GDIR_GPIO3 (*(volatile unsigned long *)(base_gpio3 + 0x0004)) 41 42 43 static int key_open(struct inode *inode, struct file *file) 44 { 45 printk("<0>function open!\n\n"); 46 //// 47 //在此加入一个死循环 48 while(1); 49 //// 50 return 0; 51 } 52 53 static int key_read(struct file *filp, char __user *buff, size_t count, loff_t *offp) 54 { 55 return 0; 56 } 57 58 static ssize_t key_write(struct file *file, const char __user *buf, size_t count, loff_t * ppos) 59 { 60 printk("<0>function write!\n\n"); 61 return 1; 62 } 63 64 static int key_release(struct inode *inode, struct file *filp) 65 { 66 printk("<0>function write!\n\n"); 67 return 0; 68 } 69 70 static int key_ioctl(struct inode *inode,struct file *flip,unsigned int command,unsigned long arg) 71 { 72 printk("<0>function ioctl!\n\n"); 73 74 switch(command) 75 { 76 case LED_ON: 77 DR_GPIO3 &= ~(0x01 << 23); //将GPIO2_23清零 亮 78 break; 79 case LED_OFF: 80 81 DR_GPIO3 |= (0x01 << 23); //将GPIO2_23置1 灭 82 break; 83 default: 84 break; 85 } 86 87 return 0; 88 } 89 
static struct file_operations key_fops = { 90 .owner = THIS_MODULE, /* 这是一个宏,推向编译模块时自动创建的__this_module变量 */ 91 .open = key_open, 92 .read = key_read, 93 .write = key_write, 94 .release= key_release, 95 .ioctl = key_ioctl, 96 }; 97 98 void gpio_addr(void){ 99 printk("<0>addr base_iomux : %x \n",base_iomux);100 printk("<0>addr base_gpio3 : %x \n",base_gpio3);101 printk("<0>addr MUX_CTL : %x \n",&MUX_CTL);102 printk("<0>addr PAD_CTL : %x \n",&PAD_CTL);103 printk("<0>addr GDIR_GPIO3 : %x \n",&GDIR_GPIO3);104 printk("<0>addr DR_GPIO3 : %x \n",&DR_GPIO3);105 }106 107 108 109 void led_on_off(void){110 ssleep(1);111 DR_GPIO3 |= (0x01 << 23); //将GPIO2_23置1112 ssleep(1);113 DR_GPIO3 &= ~(0x01 << 23); //将GPIO2_23清零114 ssleep(1);115 DR_GPIO3 |= (0x01 << 23); //将GPIO2_23置1116 ssleep(1);117 DR_GPIO3 &= ~(0x01 << 23); //将GPIO2_23清零118 ssleep(1);119 DR_GPIO3 |= (0x01 << 23); //将GPIO2_23置1120 ssleep(1);121 DR_GPIO3 &= ~(0x01 << 23); //将GPIO2_23清零122 ssleep(1);123 DR_GPIO3 |= (0x01 << 23); //将GPIO2_23置1124 ssleep(1);125 DR_GPIO3 &= ~(0x01 << 23); //将GPIO2_23清零126 ssleep(1);127 DR_GPIO3 |= (0x01 << 23); //将GPIO2_23置1128 }129 130 static int __init key_irq_init(void)131 {132 printk("<0>\nHello,this is %s module!\n\n",Driver_NAME);133 //register and mknod134 major = register_chrdev(0,Driver_NAME,&key_fops);135 drv_class = class_create(THIS_MODULE,Driver_NAME);136 drv_class_dev = device_create(drv_class,NULL,MKDEV(major,0),NULL,DEVICE_NAME); /*/dev/key_query*/137 138 //IO端口申请 ioremap 可以直接通过指针来访问这些地址139 base_iomux = ioremap(0x43FAC000,0xFFF);140 base_gpio3 = ioremap(0x53FA4000,0xFFF);141 142 //MUX_CTL143 MUX_CTL &= ~(0x07 << 0); 144 MUX_CTL |= (0X05 << 0); //设置为ALT5 GPIO3_23 ERR_LED145 //PAD_CTL146 PAD_CTL &= ~(0x01<<13 | 0x01<<3 | 0x03<<1 | 0x01<<0); //1.8v 不需要上拉下拉 CMOS输出 slew rate147 //GDIR_GPIO3 配置为输出模式148 GDIR_GPIO3 &= ~(0x01 << 23); 149 GDIR_GPIO3 |= (0x01 << 23); //配置为输出模式 150 151 //DR_GPIO3 配置为输出0 点亮ERR_LED152 DR_GPIO3 &= ~(0x01 << 23); //将GPIO2_23清零153 DR_GPIO3 &= ~(0x01 << 23); 
//将GPIO2_23清零154 gpio_addr();155 led_on_off();156 return 0; 157 }158 159 static void __exit key_irq_exit(void)160 {161 gpio_addr();162 printk("<0>\nGoodbye,%s!\n\n",Driver_NAME);163 led_on_off();164 165 unregister_chrdev(major,Driver_NAME);166 device_unregister(drv_class_dev);167 class_destroy(drv_class);168 169 //释放IO端口170 iounmap(base_iomux);171 iounmap(base_gpio3);172 }173 174 175 /* 这两行指定驱动程序的初始化函数和卸载函数 */176 module_init(key_irq_init);177 module_exit(key_irq_exit);178 179 /* 描述驱动程序的一些信息,不是必须的 */180 MODULE_AUTHOR("Lover雪儿");181 MODULE_VERSION("0.1.0");182 MODULE_DESCRIPTION("IMX257 key Driver");183 MODULE_LICENSE("GPL");