From: Andrea Arcangeli <and...@suse.de> Subject: patch oom 2.3.x Date: 2000/01/08 Message-ID: <fa.iob4c8v.1b56k3f@ifi.uio.no> X-Deja-AN: 570031124 Original-Date: Sat, 8 Jan 2000 17:17:09 +0100 (CET) Sender: owner-linux-ker...@vger.rutgers.edu Original-Message-ID: <Pine.LNX.4.21.0001081708430.6128-100000@alpha.random> X-PGP-Key-URL: http://e-mind.com/~andrea/aa.asc To: Linus Torvalds <torva...@transmeta.com> X-Sender: and...@alpha.random Content-Type: TEXT/PLAIN; charset=US-ASCII X-Orcpt: rfc822;linux-kernel-outgoing-dig Organization: Internet mailing list MIME-Version: 1.0 X-GnuPG-Key-URL: http://e-mind.com/~andrea/aa.gnupg.asc Newsgroups: fa.linux.kernel X-Loop: majord...@vger.rutgers.edu o fixes the init-gets-sigsegv problem during oom. o avoids sending SIGKILL to iopl'ed tasks and sends them a SIGTERM instead, so X won't screw up the console due to oom on IA32. o makes the signal code react better to oom. o updates a few places that don't know about the new handle_mm_fault interface yet (btw, in ptrace David prefers to send the SIGKILL due to oom to the traced `tsk` and not to the `current` tracer task; I leave it to him to send an incremental patch if he wants; I actually prefer the way it is done in my patch because it is obviously safe). o removes the deprecated oom function. o prevents the swap algorithm from reassigning swap_cnt more than once per try. diff -urN 2.3.36pre5/arch/alpha/kernel/signal.c 2.3.36pre5-oom/arch/alpha/kernel/signal.c --- 2.3.36pre5/arch/alpha/kernel/signal.c Wed Nov 24 18:22:03 1999 +++ 2.3.36pre5-oom/arch/alpha/kernel/signal.c Mon Jan 3 19:00:05 2000 @@ -437,6 +437,8 @@ err |= __copy_to_user(frame->extramask, &set->sig[1], sizeof(frame->extramask)); } + if (err) + goto give_sigsegv; /* Set up to return from userspace. If provided, use a stub already in userspace. 
*/ @@ -499,6 +501,8 @@ err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, sw, set->sig[0], oldsp); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + if (err) + goto give_sigsegv; /* Set up to return from userspace. If provided, use a stub already in userspace. */ diff -urN 2.3.36pre5/arch/alpha/mm/fault.c 2.3.36pre5-oom/arch/alpha/mm/fault.c --- 2.3.36pre5/arch/alpha/mm/fault.c Wed Nov 24 18:22:03 1999 +++ 2.3.36pre5-oom/arch/alpha/mm/fault.c Mon Jan 3 19:00:05 2000 @@ -130,13 +130,13 @@ * make sure we exit gracefully rather than endlessly redo * the fault. */ +survive: fault = handle_mm_fault(current, vma, address, cause > 0); - up(&mm->mmap_sem); - if (fault < 0) goto out_of_memory; if (fault == 0) goto do_sigbus; + up(&mm->mmap_sem); return; @@ -177,13 +177,23 @@ * us unable to handle the page fault gracefully. */ out_of_memory: - printk(KERN_ALERT "VM: killing process %s(%d)\n", - current->comm, current->pid); - if (!user_mode(regs)) - goto no_context; - do_exit(SIGKILL); + if (current->pid == 1) + { + current->policy |= SCHED_YIELD; + schedule(); + goto survive; + } + up(&mm->mmap_sem); + if (user_mode(regs)) + { + printk(KERN_ALERT "VM: killing process %s(%d)\n", + current->comm, current->pid); + do_exit(SIGKILL); + } + goto no_context; do_sigbus: + up(&mm->mmap_sem); /* * Send a sigbus, regardless of whether we were in kernel * or user mode. diff -urN 2.3.36pre5/arch/i386/kernel/signal.c 2.3.36pre5-oom/arch/i386/kernel/signal.c --- 2.3.36pre5/arch/i386/kernel/signal.c Wed Nov 24 18:22:03 1999 +++ 2.3.36pre5-oom/arch/i386/kernel/signal.c Mon Jan 3 19:00:05 2000 @@ -419,13 +419,19 @@ ? 
current->exec_domain->signal_invmap[sig] : sig), &frame->sig); + if (err) + goto give_sigsegv; err |= setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]); + if (err) + goto give_sigsegv; if (_NSIG_WORDS > 1) { err |= __copy_to_user(frame->extramask, &set->sig[1], sizeof(frame->extramask)); } + if (err) + goto give_sigsegv; /* Set up to return from userspace. If provided, use a stub already in userspace. */ @@ -486,6 +492,8 @@ err |= __put_user(&frame->info, &frame->pinfo); err |= __put_user(&frame->uc, &frame->puc); err |= __copy_to_user(&frame->info, info, sizeof(*info)); + if (err) + goto give_sigsegv; /* Create the ucontext. */ err |= __put_user(0, &frame->uc.uc_flags); @@ -497,6 +505,8 @@ err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, regs, set->sig[0]); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + if (err) + goto give_sigsegv; /* Set up to return from userspace. If provided, use a stub already in userspace. */ diff -urN 2.3.36pre5/arch/i386/mm/fault.c 2.3.36pre5-oom/arch/i386/mm/fault.c --- 2.3.36pre5/arch/i386/mm/fault.c Wed Nov 24 18:22:04 1999 +++ 2.3.36pre5-oom/arch/i386/mm/fault.c Mon Jan 3 19:00:05 2000 @@ -31,6 +31,7 @@ { struct vm_area_struct * vma; unsigned long start = (unsigned long) addr; + int fault; if (!size) return 1; @@ -50,8 +51,12 @@ start &= PAGE_MASK; for (;;) { - if (handle_mm_fault(current, vma, start, 1) <= 0) - goto bad_area; +survive: + fault = handle_mm_fault(current, vma, start, 1); + if (!fault) + goto do_sigbus; + if (fault < 0) + goto out_of_memory; if (!size) break; size--; @@ -74,6 +79,19 @@ bad_area: return 0; + +do_sigbus: + force_sig(SIGBUS, current); + goto bad_area; + +out_of_memory: + if (current->pid == 1) + { + current->policy |= SCHED_YIELD; + schedule(); + goto survive; + } + goto bad_area; } static inline void handle_wp_test (void) @@ -188,6 +206,7 @@ * make sure we exit gracefully rather than endlessly redo * the fault. 
*/ +survive: { int fault = handle_mm_fault(tsk, vma, address, write); if (fault < 0) @@ -280,10 +299,33 @@ * us unable to handle the page fault gracefully. */ out_of_memory: + if (tsk->pid == 1) + { + tsk->policy |= SCHED_YIELD; + schedule(); + goto survive; + } up(&mm->mmap_sem); - printk("VM: killing process %s\n", tsk->comm); if (error_code & 4) - do_exit(SIGKILL); + { + if (!((regs->eflags >> 12) & 3)) + { + printk(KERN_ALERT "VM: killing process %s\n", + tsk->comm); + do_exit(SIGKILL); + } + else + { + /* + * The task is running with privilegies and so we + * trust it and we give it a chance to die gracefully. + */ + printk(KERN_ALERT "VM: terminating process %s\n", + tsk->comm); + force_sig(SIGTERM, current); + return; + } + } goto no_context; do_sigbus: diff -urN 2.3.36pre5/fs/exec.c 2.3.36pre5-oom/fs/exec.c --- 2.3.36pre5/fs/exec.c Mon Jan 3 18:56:24 2000 +++ 2.3.36pre5-oom/fs/exec.c Mon Jan 3 19:02:11 2000 @@ -277,13 +277,13 @@ pmd = pmd_alloc(pgd, address); if (!pmd) { __free_page(page); - oom(tsk); + force_sig(SIGKILL, tsk); return; } pte = pte_alloc(pmd, address); if (!pte) { __free_page(page); - oom(tsk); + force_sig(SIGKILL, tsk); return; } if (!pte_none(*pte)) { diff -urN 2.3.36pre5/include/linux/mm.h 2.3.36pre5-oom/include/linux/mm.h --- 2.3.36pre5/include/linux/mm.h Mon Jan 3 18:56:25 2000 +++ 2.3.36pre5-oom/include/linux/mm.h Mon Jan 3 19:00:05 2000 @@ -400,7 +400,6 @@ unsigned int * zones_size, unsigned long zone_start_paddr); extern void mem_init(void); extern void show_mem(void); -extern void oom(struct task_struct * tsk); extern void si_meminfo(struct sysinfo * val); extern void swapin_readahead(swp_entry_t); diff -urN 2.3.36pre5/kernel/ptrace.c 2.3.36pre5-oom/kernel/ptrace.c --- 2.3.36pre5/kernel/ptrace.c Sun Nov 21 03:20:20 1999 +++ 2.3.36pre5-oom/kernel/ptrace.c Mon Jan 3 19:01:02 2000 @@ -26,6 +26,7 @@ unsigned long mapnr; unsigned long maddr; struct page *page; + int fault; repeat: pgdir = pgd_offset(vma->vm_mm, addr); @@ -64,8 +65,12 @@ 
fault_in_page: /* -1: out of memory. 0 - unmapped page */ - if (handle_mm_fault(tsk, vma, addr, write) > 0) + fault = handle_mm_fault(tsk, vma, addr, write); + if (fault > 0) goto repeat; + if (fault < 0) + /* the out of memory is been triggered by the current task. */ + force_sig(SIGKILL, current); return 0; bad_pgd: diff -urN 2.3.36pre5/mm/memory.c 2.3.36pre5-oom/mm/memory.c --- 2.3.36pre5/mm/memory.c Mon Jan 3 18:56:25 2000 +++ 2.3.36pre5-oom/mm/memory.c Mon Jan 3 19:00:05 2000 @@ -70,16 +70,6 @@ mem_map_t * mem_map = NULL; /* - * oom() prints a message (so that the user knows why the process died), - * and gives the process an untrappable SIGKILL. - */ -void oom(struct task_struct * task) -{ - printk("\nOut of memory for %s.\n", task->comm); - force_sig(SIGKILL, task); -} - -/* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. */ diff -urN 2.3.36pre5/mm/vmscan.c 2.3.36pre5-oom/mm/vmscan.c --- 2.3.36pre5/mm/vmscan.c Fri Dec 31 16:33:05 1999 +++ 2.3.36pre5-oom/mm/vmscan.c Mon Jan 3 19:00:05 2000 @@ -328,6 +328,7 @@ struct task_struct * p; int counter; int __ret = 0; + int assign = 0; lock_kernel(); /* @@ -347,12 +348,9 @@ counter = nr_threads / (priority+1); if (counter < 1) counter = 1; - if (counter > nr_threads) - counter = nr_threads; for (; counter >= 0; counter--) { - int assign = 0; - int max_cnt = 0; + unsigned long max_cnt = 0; struct mm_struct *best = NULL; int pid = 0; select: @@ -365,7 +363,7 @@ if (mm->rss <= 0) continue; /* Refresh swap_cnt? */ - if (assign) + if (assign == 1) mm->swap_cnt = mm->rss; if (mm->swap_cnt > max_cnt) { max_cnt = mm->swap_cnt; @@ -374,6 +372,8 @@ } } read_unlock(&tasklist_lock); + if (assign == 1) + assign = 2; if (!best) { if (!assign) { assign = 1; Andrea - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.rutgers.edu Please read the FAQ at http://www.tux.org/lkml/