现在的位置: 首页 > 综合 > 正文

OOM(out_of_memory) killer分析

2013年06月19日 ⁄ 综合 ⁄ 共 6399字 ⁄ 字号 评论关闭

Chipset: MSM8X25Q

Codebase: Android4.1

Kernel: 3.4.0

 

概念:

         OOM killer,即out of memory killer,是Linux下面内存耗尽时的一种处理机制。当内存不足时,OOM killer会遍历整个进程链表,然后根据进程的内存使用情况以及它的oom score值最终找到得分最高的进程,然后发送kill信号将其杀掉。

         伙伴系统中在分配内存时会做判断,当内存不足时,会调用核心函数out_of_memory(), 函数位于文件oom_kill.c@kernel/mm.

         下面先分析out_of_memory()。

out_of_memory():

/*
 * out_of_memory - select a task and kill it when memory has run out
 * @zonelist: not referenced in the visible part of this excerpt
 * @gfp_mask: allocation flags, forwarded to the panic check, the dump
 *            header and oom_kill_process()
 * @order: allocation order, forwarded to the same helpers
 * @nodemask: forwarded to oom_unkillable_task() and oom_kill_process()
 * @force_kill: forwarded to select_bad_process()
 *
 * NOTE(review): the "~~snip" markers are places where the article elided
 * code. totalpages, mpol_mask and constraint are presumably assigned in
 * those elided parts - confirm against the full source.
 */
void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
		int order, nodemask_t *nodemask, bool force_kill)
{
	const nodemask_t *mpol_mask;
	struct task_struct *p;
	unsigned long totalpages;
	unsigned long freed = 0;
	unsigned int points;
	enum oom_constraint constraint = CONSTRAINT_NONE;
	int killed = 0;

~~snip
/*
 * If the current task already has a fatal signal pending, mark it
 * TIF_MEMDIE and return immediately: the whole point of the OOM killer is
 * to free memory by killing a task, and this one is already being killed.
 */
	if (fatal_signal_pending(current)) {
		set_thread_flag(TIF_MEMDIE);
		return;
	}

~~snip
	/*
	 * User space can change the OOM behaviour through
	 * /proc/sys/vm/panic_on_oom: 1 means panic immediately on OOM,
	 * 0 means only kill the "best" process and let the system keep
	 * running.
	 */
	check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);

	read_lock(&tasklist_lock);
	
/*
 * Likewise, when /proc/sys/vm/oom_kill_allocating_task is true, the task
 * that is currently allocating is killed directly (provided it is
 * killable and has an mm).
 */
if (sysctl_oom_kill_allocating_task &&
	    !oom_unkillable_task(current, NULL, nodemask) &&
	    current->mm) {
		oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
				 nodemask,
				 "Out of memory (oom_kill_allocating_task)");
		goto out;
	}
	/*
	 * Pick the task with the highest point value, computed from each
	 * task's memory usage and OOM score information.
	 */
	p = select_bad_process(&points, totalpages, NULL, mpol_mask,
			       force_kill);
	/* Found nothing?!?! Either we hang forever, or we panic. */
	if (!p) {
		dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
		read_unlock(&tasklist_lock);
		panic("Out of memory and no killable processes...\n");
	}
	if (PTR_ERR(p) != -1UL) {
		/*
		 * A real victim was selected (not the ERR_PTR(-1UL) marker
		 * that select_bad_process() returns when it aborts the scan):
		 * kill it.
		 */
		oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
				 nodemask, "Out of memory");
		killed = 1;
	}
out:
	read_unlock(&tasklist_lock);

	/*
	 * Give "p" a good chance of killing itself before we
	 * retry to allocate memory unless "p" is current
	 */
	if (killed && !test_thread_flag(TIF_MEMDIE))
		schedule_timeout_uninterruptible(1);
}

select_bad_process():

/*
 * select_bad_process - scan all threads and pick the OOM-kill candidate
 * @ppoints: out parameter; reset to 0, then set to the score of the
 *           chosen task
 * @totalpages: forwarded to oom_badness()
 * @memcg: forwarded to oom_unkillable_task() and oom_badness()
 * @nodemask: forwarded to oom_unkillable_task() and oom_badness()
 * @force_kill: when true, the scan is not aborted for tasks that are
 *              already marked TIF_MEMDIE or are exiting
 *
 * Returns the task with the highest score, NULL if no task scored above
 * zero, or ERR_PTR(-1UL) when the scan is aborted because another task is
 * already dying.
 */
static struct task_struct *select_bad_process(unsigned int *ppoints,
		unsigned long totalpages, struct mem_cgroup *memcg,
		const nodemask_t *nodemask, bool force_kill)
{
	struct task_struct *g, *p;
	struct task_struct *chosen = NULL;
	*ppoints = 0;
	/* Walk every thread of every process. */
	do_each_thread(g, p) {
		unsigned int points;
		/* Tasks that already have an exit state are skipped. */
		if (p->exit_state)
			continue;
		/*
		 * Some core tasks must never be killed, e.g. init and
		 * kernel threads.
		 */
		if (oom_unkillable_task(p, memcg, nodemask))
			continue;
		/*
		 * This task is already marked TIF_MEMDIE: thaw it if it is
		 * frozen and, unless force_kill is set, abort the whole scan
		 * with ERR_PTR(-1UL) instead of choosing another victim.
		 */
		if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
			if (unlikely(frozen(p)))
				__thaw_task(p);
			if (!force_kill)
				return ERR_PTR(-1UL);
		}
		/* Tasks without an mm are skipped. */
		if (!p->mm)
			continue;

		if (p->flags & PF_EXITING) {
			if (p == current) {
				/* The exiting current task is provisionally chosen with 1000 points. */
				chosen = p;
				*ppoints = 1000;
			} else if (!force_kill) {
				/*
				 * If this task is not being ptraced on exit,
				 * then wait for it to finish before killing
				 * some other task unnecessarily.
				 */
				if (!(p->group_leader->ptrace & PT_TRACE_EXIT))
					return ERR_PTR(-1UL);
			}
		}
		/* Compute the score of this task. */
		points = oom_badness(p, memcg, nodemask, totalpages);
		/* Remember this task if it scores strictly higher than the best so far. */
		if (points > *ppoints) {
			chosen = p;
			*ppoints = points;
		}
	} while_each_thread(g, p);

	return chosen;
}

oom_badness():

unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
		      const nodemask_t *nodemask, unsigned long totalpages)
{
	long points;

	if (oom_unkillable_task(p, memcg, nodemask))
		return 0;

	p = find_lock_task_mm(p);
	if (!p)
		return 0;
	/*oom_score_adj为-1000的不做处理,此值可以通过/proc/pid_num/oom_score_adj设置,范围为-1000 ~ 1000,值越大越容易被oom kill掉。*/
	if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
		task_unlock(p);
		return 0;
	}

	/*
	 * The memory controller may have a limit of 0 bytes, so avoid a divide
	 * by zero, if necessary.
	 */
	if (!totalpages)
		totalpages = 1;

	/* get_mm_rss获取当前用户空间使用文件和匿名页占有内存数,nr_ptes 获取
当前保存页表使用的内存。*/
	points = get_mm_rss(p->mm) + p->mm->nr_ptes;
	/*获取交换内存使用的内存数*/
	points += get_mm_counter(p->mm, MM_SWAPENTS);
	/*每个task同等计算,可不管。*/
	points *= 1000;
	points /= totalpages;
	task_unlock(p);

	/*当该进程具有CAP_SYS_ADMIN能力,那么Point降低,因为具有ADMIN权限的
Task是被认为表现良好的。 */
	if (has_capability_noaudit(p, CAP_SYS_ADMIN))
		points -= 30;

	/*加上oom_score_adj,范围从-1000 ~ 1000. */
	points += p->signal->oom_score_adj;

	/*
	 * Never return 0 for an eligible task that may be killed since it's
	 * possible that no single user task uses more than 0.1% of memory and
	 * no single admin tasks uses more than 3.0%.
	 */
	if (points <= 0)
		return 1;
	/*1000封顶*/
	return (points < 1000) ? points : 1000;
}

oom_kill_process():

/*
 * oom_kill_process - kill the selected task, or one of its children instead
 * @p: task chosen by the caller
 * @gfp_mask: forwarded to dump_header()
 * @order: forwarded to dump_header()
 * @points: score of @p; only used in the log message
 * @totalpages: forwarded to oom_badness() when scoring children
 * @memcg: forwarded to dump_header() and oom_badness()
 * @nodemask: forwarded to dump_header() and oom_badness()
 * @message: prefix for the "Kill process" log line
 *
 * If @p is already exiting it is only marked TIF_MEMDIE. Otherwise the
 * highest-scoring child that does not share @p's mm (if any) replaces @p as
 * the victim, and SIGKILL is sent to the victim and to the other processes
 * that share the victim's mm.
 */
static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
			     unsigned int points, unsigned long totalpages,
			     struct mem_cgroup *memcg, nodemask_t *nodemask,
			     const char *message)
{
	struct task_struct *victim = p;
	struct task_struct *child;
	struct task_struct *t = p;
	struct mm_struct *mm;
	unsigned int victim_points = 0;
	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
					      DEFAULT_RATELIMIT_BURST);

	/*
	 * If the task is already exiting, don't alarm the sysadmin or kill
	 * its children or threads, just set TIF_MEMDIE so it can die quickly
	 */
	if (p->flags & PF_EXITING) {
		set_tsk_thread_flag(p, TIF_MEMDIE);
		return;
	}

	/* The diagnostic dump is rate limited. */
	if (__ratelimit(&oom_rs))
		dump_header(p, gfp_mask, order, memcg, nodemask);

	task_lock(p);
	pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
		message, task_pid_nr(p), p->comm, points);
	task_unlock(p);

	/*
	 * Among the children whose mm differs from the parent's, find the one
	 * with the highest score and kill it in place of the parent. So when a
	 * process has several children that use a lot of memory, a child may
	 * be killed while the parent stays alive.
	 */
	do {
		list_for_each_entry(child, &t->children, sibling) {
			unsigned int child_points;

			if (child->mm == p->mm)
				continue;
			/*
			 * oom_badness() returns 0 if the thread is unkillable
			 */
			child_points = oom_badness(child, memcg, nodemask,
								totalpages);
			if (child_points > victim_points) {
				victim = child;
				victim_points = child_points;
			}
		}
	} while_each_thread(p, t);

	victim = find_lock_task_mm(victim);
	if (!victim)
		return;

	/* mm cannot safely be dereferenced after task_unlock(victim) */
	mm = victim->mm;
	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
		task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
		K(get_mm_counter(victim->mm, MM_ANONPAGES)),
		K(get_mm_counter(victim->mm, MM_FILEPAGES)));
	task_unlock(victim);

	/*
	 * Every process that shares the victim's mm is killed along with it.
	 * Members of the victim's own thread group, kernel threads and tasks
	 * whose oom_score_adj is OOM_SCORE_ADJ_MIN are skipped here.
	 */
	for_each_process(p)
		if (p->mm == mm && !same_thread_group(p, victim) &&
		    !(p->flags & PF_KTHREAD)) {
			if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
				continue;

			task_lock(p);	/* Protect ->comm from prctl() */
			pr_err("Kill process %d (%s) sharing same memory\n",
				task_pid_nr(p), p->comm);
			task_unlock(p);
			/* Send SIGKILL. */
			do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
		}

	/* Finally mark the victim TIF_MEMDIE and send it SIGKILL. */
	set_tsk_thread_flag(victim, TIF_MEMDIE);
	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
}

所以,out_of_memory()做的任务就是遍历系统全部进程,然后根据内存使用情况以及oom_score_adj的值计算得到一个point, 最终将最高point的task给kill掉。

相关知识:

1.      Malloc会引起OOM killer,可参考:

http://blog.dccmx.com/2011/04/oom-killer-on-linux

2.      OOM killer只是管理、计算low memory部分,即使high memory还有很多空闲内存,也可能触发OOM。

3.      进程rss的计算可参考此文:

http://filwmm1314.blog.163.com/blog/static/2182591920121016541582/

4.      影响到oom killer行为的文件有:

/proc/sys/vm/overcommit_memory

/proc/sys/vm/panic_on_oom

/proc/sys/vm/oom_kill_allocating_task

/proc/pid_xxx/oom_score_adj

 

2013/04/27

抱歉!评论已关闭.