diff -ur -x ov511* linux-2.4.5.orig/include/linux/mm.h linux-2.4.5-exp/include/linux/mm.h --- linux-2.4.5.orig/include/linux/mm.h Sat May 26 02:01:28 2001 +++ linux-2.4.5-exp/include/linux/mm.h Thu Jun 7 23:24:11 2001 @@ -10,6 +10,7 @@ #include #include #include +#include extern unsigned long max_mapnr; extern unsigned long num_physpages; @@ -18,6 +19,7 @@ /* The inactive_clean lists are per zone. */ extern struct list_head active_list; extern struct list_head inactive_dirty_list; +extern struct list_head in_core_list; /* filled by add_page_to_incore_list() */ #include #include @@ -167,6 +169,7 @@ #define PG_skip 10 #define PG_inactive_clean 11 #define PG_highmem 12 +#define PG_in_core 13 /* set while the page is on in_core_list */ /* bits 21-29 unused */ #define PG_arch_1 30 #define PG_reserved 31 @@ -239,6 +242,10 @@ #define SetPageInactiveClean(page) set_bit(PG_inactive_clean, &(page)->flags) #define ClearPageInactiveClean(page) clear_bit(PG_inactive_clean, &(page)->flags) +#define PageInCore(page) test_bit(PG_in_core, &(page)->flags) +#define SetPageInCore(page) set_bit(PG_in_core, &(page)->flags) +#define ClearPageInCore(page) clear_bit(PG_in_core, &(page)->flags) + #ifdef CONFIG_HIGHMEM #define PageHighMem(page) test_bit(PG_highmem, &(page)->flags) #else @@ -426,6 +433,24 @@ extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); extern void swapin_readahead(swp_entry_t); + +/* + * Work out if there are any other processes sharing this + * swap cache page. Never mind the buffers.
+ */ +static inline int exclusive_swap_page(struct page *page) +{ + unsigned int count; + + if (!PageLocked(page)) + BUG(); /* callers must hold the page lock */ + if (!PageSwapCache(page)) + return 0; + count = page_count(page) - !!page->buffers; /* 2: us + swap cache */ + count += swap_count(page); /* +1: just swap cache */ + + return count == 3; /* =3: total */ +} /* mmap.c */ extern void lock_vma_mappings(struct vm_area_struct *); Only in linux-2.4.5-exp/include/linux: mm.h.orig diff -ur -x ov511* linux-2.4.5.orig/include/linux/swap.h linux-2.4.5-exp/include/linux/swap.h --- linux-2.4.5.orig/include/linux/swap.h Sat May 26 02:01:27 2001 +++ linux-2.4.5-exp/include/linux/swap.h Thu Jun 7 23:24:11 2001 @@ -69,6 +69,7 @@ extern unsigned int nr_free_buffer_pages(void); extern int nr_active_pages; extern int nr_inactive_dirty_pages; +extern int nr_incore_pages; /* number of pages on in_core_list */ extern atomic_t nr_async_pages; extern struct address_space swapper_space; extern atomic_t page_cache_size; @@ -107,7 +108,7 @@ extern int page_launder(int, int); extern int free_shortage(void); extern int inactive_shortage(void); -extern void wakeup_kswapd(void); +extern void wakeup_kswapd(int block); /* block != 0: sleep until kswapd wakes kswapd_done */ extern int try_to_free_pages(unsigned int gfp_mask); /* linux/mm/page_io.c */ @@ -133,6 +134,7 @@ extern void __delete_from_swap_cache(struct page *page); extern void delete_from_swap_cache(struct page *page); extern void delete_from_swap_cache_nolock(struct page *page); +extern void free_page_and_swap_cache(struct page *page); /* linux/mm/swapfile.c */ extern unsigned int nr_swapfiles; @@ -173,8 +175,8 @@ * can chose a fairly large maximum. */ #define PAGE_AGE_START 2 -#define PAGE_AGE_ADV 3 -#define PAGE_AGE_MAX 64 +#define PAGE_AGE_ADV 4 +#define PAGE_AGE_MAX 128 /* * List add/del helper macros.
These must be called @@ -190,6 +192,7 @@ #define add_page_to_active_list(page) { \ DEBUG_ADD_PAGE \ ZERO_PAGE_BUG \ + if(PageInCore(page)) del_page_from_incore_list(page); /* lru link is shared: leave in_core_list first */ \ SetPageActive(page); \ list_add(&(page)->lru, &active_list); \ nr_active_pages++; \ @@ -198,6 +201,7 @@ #define add_page_to_inactive_dirty_list(page) { \ DEBUG_ADD_PAGE \ ZERO_PAGE_BUG \ + if(PageInCore(page)) del_page_from_incore_list(page); /* lru link is shared: leave in_core_list first */ \ SetPageInactiveDirty(page); \ list_add(&(page)->lru, &inactive_dirty_list); \ nr_inactive_dirty_pages++; \ @@ -207,11 +211,21 @@ #define add_page_to_inactive_clean_list(page) { \ DEBUG_ADD_PAGE \ ZERO_PAGE_BUG \ + if(PageInCore(page)) del_page_from_incore_list(page); /* lru link is shared: leave in_core_list first */ \ SetPageInactiveClean(page); \ list_add(&(page)->lru, &page->zone->inactive_clean_list); \ page->zone->inactive_clean_pages++; \ } +#define add_page_to_incore_list(page) { \ + DEBUG_ADD_PAGE \ + ZERO_PAGE_BUG \ + if(PageInCore(page)) BUG(); /* double insertion */ \ + SetPageInCore(page); \ + list_add(&(page)->lru, &in_core_list); \ + nr_incore_pages++; \ +} + #define del_page_from_active_list(page) { \ list_del(&(page)->lru); \ ClearPageActive(page); \ @@ -233,6 +247,15 @@ list_del(&(page)->lru); \ ClearPageInactiveClean(page); \ page->zone->inactive_clean_pages--; \ + DEBUG_ADD_PAGE \ + ZERO_PAGE_BUG \ +} + +#define del_page_from_incore_list(page) { \ + if(!PageInCore(page)) BUG(); /* page is not on in_core_list */ \ + list_del(&(page)->lru); \ + ClearPageInCore(page); \ + nr_incore_pages--; \ DEBUG_ADD_PAGE \ ZERO_PAGE_BUG \ } Only in linux-2.4.5-exp/include/linux: swap.h.orig diff -ur -x ov511* linux-2.4.5.orig/mm/memory.c linux-2.4.5-exp/mm/memory.c --- linux-2.4.5.orig/mm/memory.c Fri Apr 27 22:23:25 2001 +++ linux-2.4.5-exp/mm/memory.c Thu Jun 7 22:17:51 2001 @@ -274,7 +274,7 @@ */ if (pte_dirty(pte) && page->mapping) set_page_dirty(page); - page_cache_release(page); + free_page_and_swap_cache(page); /* also drops the swap cache entry when exclusive_swap_page() says it is unshared */ return 1; } swap_free(pte_to_swp_entry(pte)); @@ -869,23 +869,6 @@ flush_page_to_ram(new_page); flush_cache_page(vma, address);
establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); -} - -/* - * Work out if there are any other processes sharing this - * swap cache page. Never mind the buffers. - */ -static inline int exclusive_swap_page(struct page *page) -{ - unsigned int count; - - if (!PageLocked(page)) - BUG(); - if (!PageSwapCache(page)) - return 0; - count = page_count(page) - !!page->buffers; /* 2: us + swap cache */ - count += swap_count(page); /* +1: just swap cache */ - return count == 3; /* =3: total */ } diff -ur -x ov511* linux-2.4.5.orig/mm/mmap.c linux-2.4.5-exp/mm/mmap.c --- linux-2.4.5.orig/mm/mmap.c Thu May 24 23:20:18 2001 +++ linux-2.4.5-exp/mm/mmap.c Thu Jun 7 23:24:11 2001 @@ -56,8 +56,11 @@ unsigned long free; - /* Sometimes we want to use more memory than we have. */ - if (sysctl_overcommit_memory) + /* + * Sometimes we want to use more memory than we have. + * This includes for root, since he probably needs to log in. + */ + if (sysctl_overcommit_memory || current->uid == 0) /* NOTE(review): tests the real uid only, not euid or a capability - confirm intended */ return 1; free = atomic_read(&buffermem_pages); @@ -71,7 +74,7 @@ * for the swap-space over-allocation (ie "nr_swap_pages" being * too small. */ - free += swapper_space.nrpages; + /* free += swapper_space.nrpages; */ /* * The code below doesn't account for free space in the inode @@ -79,8 +82,16 @@ * dentries which will become freeable under VM load, etc. * Lets just hope all these (complex) factors balance out... */ - free += (dentry_stat.nr_unused * sizeof(struct dentry)) >> PAGE_SHIFT; - free += (inodes_stat.nr_unused * sizeof(struct inode)) >> PAGE_SHIFT; + /* free += (dentry_stat.nr_unused * sizeof(struct dentry)) >> PAGE_SHIFT; + free += (inodes_stat.nr_unused * sizeof(struct inode)) >> PAGE_SHIFT; + */ + + /* + * We want to stop reservation before OOM if possible.
+ */ + free = (free > freepages.high) ? free - freepages.high : 0; /* free is unsigned long: clamp at zero so the reserve cannot wrap it around and make the check pass */ + if(current->mm) + free -= (free > current->mm->total_vm / 4) ? current->mm->total_vm / 4 : free; /* same clamp for the per-process margin (a quarter of total_vm) */ return free > pages; } diff -ur -x ov511* linux-2.4.5.orig/mm/oom_kill.c linux-2.4.5-exp/mm/oom_kill.c --- linux-2.4.5.orig/mm/oom_kill.c Tue May 15 08:25:41 2001 +++ linux-2.4.5-exp/mm/oom_kill.c Thu Jun 7 23:24:11 2001 @@ -191,15 +191,17 @@ */ int out_of_memory(void) { - /* Enough free memory? Not OOM. */ - if (nr_free_pages() > freepages.min) - return 0; + unsigned long free; - if (nr_free_pages() + nr_inactive_clean_pages() > freepages.low) + /* Enough free memory or swap space? Not OOM. */ + free = nr_free_pages() + nr_swap_pages; + if(free > freepages.high) return 0; - /* Enough swap space left? Not OOM. */ - if (nr_swap_pages > 0) + /* Buffers & cache can be reclaimed. */ + free += atomic_read(&buffermem_pages); + free += atomic_read(&page_cache_size); + if(free > freepages.high) return 0; /* Else... */ diff -ur -x ov511* linux-2.4.5.orig/mm/page_alloc.c linux-2.4.5-exp/mm/page_alloc.c --- linux-2.4.5.orig/mm/page_alloc.c Sat May 26 00:55:23 2001 +++ linux-2.4.5-exp/mm/page_alloc.c Thu Jun 7 23:24:11 2001 @@ -21,6 +21,7 @@ int nr_swap_pages; int nr_active_pages; int nr_inactive_dirty_pages; +int nr_incore_pages; pg_data_t *pgdat_list; static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; @@ -30,6 +31,7 @@ struct list_head active_list; struct list_head inactive_dirty_list; +struct list_head in_core_list; /* * Free_page() adds the page to the free lists. This is optimized for * fast normal cases (no error jumps taken normally). @@ -299,6 +301,9 @@ if (order == 0 && (gfp_mask & __GFP_WAIT)) direct_reclaim = 1; + if(free_shortage()) + wakeup_kswapd(0); /* non-blocking: just kick kswapd early */ + try_again: /* * First, see if we have any zones with lots of free memory.
@@ -365,7 +370,7 @@ * - if we don't have __GFP_IO set, kswapd may be * able to free some memory we can't free ourselves */ - wakeup_kswapd(); + wakeup_kswapd(0); /* non-blocking wake-up */ if (gfp_mask & __GFP_WAIT) { __set_current_state(TASK_RUNNING); current->policy |= SCHED_YIELD; @@ -436,10 +441,12 @@ * - moving clean pages from the inactive dirty list to * the inactive clean list. (done by page_launder) */ - if (gfp_mask & __GFP_WAIT) { - memory_pressure++; - try_to_free_pages(gfp_mask); - goto try_again; + if ((gfp_mask & (__GFP_WAIT | __GFP_IO)) == (__GFP_WAIT | __GFP_IO)) { + int progress = try_to_free_pages(gfp_mask); + if(!progress) { /* NOTE(review): the retry happens only when no progress was reported - confirm intended */ + wakeup_kswapd(1); /* sleep until kswapd wakes kswapd_done */ + goto try_again; + } } } @@ -748,6 +755,7 @@ memlist_init(&active_list); memlist_init(&inactive_dirty_list); + memlist_init(&in_core_list); /* * Some architectures (with lots of mem and discontinous memory diff -ur -x ov511* linux-2.4.5.orig/mm/swap.c linux-2.4.5-exp/mm/swap.c --- linux-2.4.5.orig/mm/swap.c Mon Jan 22 21:30:21 2001 +++ linux-2.4.5-exp/mm/swap.c Thu Jun 7 23:24:11 2001 @@ -105,13 +105,15 @@ */ void age_page_down_ageonly(struct page * page) { - page->age /= 2; + if(page->age) + page->age--; /* linear decay, never below zero */ } void age_page_down_nolock(struct page * page) { /* The actual page aging bit */ - page->age /= 2; + if(page->age) + page->age--; /* linear decay, never below zero */ /* * The page is now an old page. Move to the inactive @@ -139,7 +141,8 @@ void age_page_down(struct page * page) { /* The actual page aging bit */ - page->age /= 2; + if(page->age) + page->age--; /* linear decay, never below zero */ /* * The page is now an old page. Move to the inactive @@ -211,8 +214,8 @@ } /* Make sure the page gets a fair chance at staying active.
*/ - if (page->age < PAGE_AGE_START) - page->age = PAGE_AGE_START; + if (page->age < PAGE_AGE_MAX >> 1) + page->age = PAGE_AGE_MAX >> 1; /* floor is now half the maximum age */ } void activate_page(struct page * page) diff -ur -x ov511* linux-2.4.5.orig/mm/swap_state.c linux-2.4.5-exp/mm/swap_state.c --- linux-2.4.5.orig/mm/swap_state.c Thu May 24 23:20:18 2001 +++ linux-2.4.5-exp/mm/swap_state.c Thu Jun 7 23:24:11 2001 @@ -146,6 +146,30 @@ } /* + * Perform a free_page(), also freeing any swap cache associated with + * this page if it is the last user of the page. Can not do a lock_page, + * as we are holding the page_table_lock spinlock. + */ +void free_page_and_swap_cache(struct page *page) +{ + /* + * If we are the only user, then try to free up the swap cache. + * + * It's ok to check for PageSwapCache without the page lock + * here because we are going to recheck again inside + * exclusive_swap_page() _with_ the lock. + * - Marcelo + */ + if (PageSwapCache(page) && !TryLockPage(page)) { /* lock not obtained: skip the swap cache cleanup rather than wait */ + if (exclusive_swap_page(page)) + delete_from_swap_cache_nolock(page); + UnlockPage(page); + } + page_cache_release(page); /* done unconditionally */ +} + + +/* * Lookup a swap entry in the swap cache. A found page will be returned * unlocked and with its refcount incremented - we rely on the kernel * lock getting page table operations atomic even if we drop the page @@ -219,6 +243,12 @@ BUG(); add_to_swap_cache(new_page, entry); rw_swap_page(READ, new_page); + + /* + * Physical I/O is expensive, make sure this page stays in + * as long as possible. + */ + new_page->age = PAGE_AGE_MAX >> 1; /* start at half the maximum age */ return new_page; out_free_page: Only in linux-2.4.5-exp/mm: swap_state.c.orig diff -ur -x ov511* linux-2.4.5.orig/mm/vmscan.c linux-2.4.5-exp/mm/vmscan.c --- linux-2.4.5.orig/mm/vmscan.c Sat May 26 01:00:18 2001 +++ linux-2.4.5-exp/mm/vmscan.c Thu Jun 7 23:24:11 2001 @@ -41,11 +41,20 @@ pte_t pte; swp_entry_t entry; - /* Don't look at this pte if it's been accessed recently. */ + /* Age it up, if it's been accessed recently.
*/ if (ptep_test_and_clear_young(page_table)) { - page->age += PAGE_AGE_ADV; - if (page->age > PAGE_AGE_MAX) - page->age = PAGE_AGE_MAX; + age_page_up(page); + } + + /* If the page still has a non-zero age, leave it alone for the time being. + * Also make sure it's in the in-core list. + */ + if(page->age) { + spin_lock(&pagemap_lru_lock); + if(!PageInCore(page) && !PageActive(page) && + !PageInactiveClean(page) && !PageInactiveDirty(page)) + add_page_to_incore_list(page); /* page carries none of the four list flags yet */ + spin_unlock(&pagemap_lru_lock); return; } @@ -232,6 +241,13 @@ if (!count) return 1; + + /* Don't penalise processes which are already small + * compared to the one causing swapping out + */ + if(current->mm && mm->rss < current->mm->total_vm) /* skipped when current has no mm */ + return 0; + /* * Go through process' page directory. */ @@ -269,7 +285,7 @@ * N.B. This function returns only 0 or 1. Return values != 1 from * the lower level routines result in continued processing. */ -#define SWAP_SHIFT 5 +#define SWAP_SHIFT 6 #define SWAP_MIN 8 static inline int swap_amount(struct mm_struct *mm) @@ -408,6 +424,25 @@ return page; } +/* Check for dead swapcache pages and clean them; returns 1 if the page was cleaned, 0 otherwise (including when the page lock could not be taken). */ +static inline int clean_dead_swap_page (struct page* page) +{ + int ret = 0; + if(!TryLockPage(page)) { + if (PageSwapCache(page) && PageDirty(page) && + (page_count(page) - !!page->buffers) == 1 && /* presumably: only the swap cache still references the page */ + swap_count(page) == 1) { /* presumably: and only the swap cache uses the swap entry */ + ClearPageDirty(page); /* nobody can read it back, so do not write it out */ + ClearPageReferenced(page); + page->age = 0; + ret = 1; + } + UnlockPage(page); + } + return ret; +} + + /** * page_launder - clean dirty inactive pages, move to inactive_clean list * @gfp_mask: what operations we are allowed to do @@ -461,6 +496,9 @@ continue; } + /* Check for dead swapcache pages and clean them. */ + clean_dead_swap_page(page); + /* Page is or was in use? Move it to the active list.
*/ if (PageReferenced(page) || page->age > 0 || (!page->buffers && page_count(page) > 1) || @@ -638,6 +676,42 @@ } /** + * in_core_scan - ages the in-core (not in swapcache or buffers) pages + * + * This function will scan each in-core page exactly once (if it has been + * found by a previous swap_out() scan) and perform ageing on it. + */ +void in_core_scan(void) +{ + struct list_head * page_lru; + struct page * page; + int maxscan = nr_incore_pages; /* bound the walk to one full pass over the list */ + + /* Take the lock while messing with the list... */ + spin_lock(&pagemap_lru_lock); + while (maxscan-- > 0 && (page_lru = in_core_list.prev) != &in_core_list) { + page = list_entry(page_lru, struct page, lru); + if(!PageInCore(page)) { + printk("VM: in_core_scan(), wrong page on list!\n"); + list_del(page_lru); + nr_incore_pages--; + continue; + } + + /* Shuffle the clock */ + list_del(page_lru); + list_add(page_lru, &in_core_list); + + /* Do aging on the pages. */ + if (PageTestandClearReferenced(page)) + age_page_up_nolock(page); + else + age_page_down_ageonly(page); + } + spin_unlock(&pagemap_lru_lock); /* was missing: the function returned with the LRU lock still held */ +} + +/** * refill_inactive_scan - scan the active list and find pages to deactivate * @priority: the priority at which to scan * @target: number of pages to deactivate, zero for background aging @@ -649,30 +723,16 @@ { struct list_head * page_lru; struct page * page; - int maxscan = nr_active_pages >> priority; + int maxscan = nr_active_pages; int page_active = 0; int nr_deactivated = 0; /* * When we are background aging, we try to increase the page aging - * information in the system. When we have too many inactive pages - * we don't do background aging since having all pages on the - * inactive list decreases aging information. - * - * Since not all active pages have to be on the active list, we round - * nr_active_pages up to num_physpages/2, if needed. + * information in the system.
*/ - if (!target) { - int inactive = nr_free_pages() + nr_inactive_clean_pages() + - nr_inactive_dirty_pages; - int active = MAX(nr_active_pages, num_physpages / 2); - if (active > 10 * inactive) - maxscan = nr_active_pages >> 4; - else if (active > 3 * inactive) - maxscan = nr_active_pages >> 8; - else - return 0; - } + if(!target) + maxscan = nr_active_pages >> 4; /* background aging: look at 1/16 of the active list */ /* Take the lock while messing with the list... */ spin_lock(&pagemap_lru_lock); @@ -687,6 +747,13 @@ continue; } + /* special case for dead swapcache pages */ + if (clean_dead_swap_page(page)) { + deactivate_page_nolock(page); + nr_deactivated++; + continue; + } + /* Do aging on the pages. */ if (PageTestandClearReferenced(page)) { age_page_up_nolock(page); @@ -848,6 +915,7 @@ goto done; /* If refill_inactive_scan failed, try to page stuff out.. */ + in_core_scan(); /* age the in-core list before swap_out() walks the page tables */ swap_out(DEF_PRIORITY, gfp_mask); if (--maxtry <= 0) @@ -881,7 +949,7 @@ * If needed, we move pages from the active list * to the inactive list. */ - if (inactive_shortage()) + if (inactive_shortage() && !user) /* NOTE(review): refill is now skipped whenever user is non-zero - confirm intended */ ret += refill_inactive(gfp_mask, user); /* @@ -892,6 +960,7 @@ return ret; } +struct task_struct * kswapd_tsk; /* kswapd's own task; lets wakeup_kswapd() avoid waiting on itself */ DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); DECLARE_WAIT_QUEUE_HEAD(kswapd_done); @@ -930,6 +999,8 @@ */ tsk->flags |= PF_MEMALLOC; + kswapd_tsk = current; + /* * Kswapd main loop. */ @@ -951,6 +1022,7 @@ refill_inactive_scan(DEF_PRIORITY, 0); } + wake_up_all(&kswapd_done); /* release tasks sleeping in wakeup_kswapd(1) */ run_task_queue(&tq_disk); /* @@ -977,14 +1049,35 @@ */ } else if (out_of_memory()) { oom_kill(); + } else { + /* Blink, to allow processes to run which can.
*/ + interruptible_sleep_on_timeout(&kswapd_wait, 1); } } } -void wakeup_kswapd(void) +void wakeup_kswapd(int block) { - if (waitqueue_active(&kswapd_wait)) - wake_up_interruptible(&kswapd_wait); + DECLARE_WAITQUEUE(wait, current); + + if(current == kswapd_tsk) + return; /* kswapd must not wake or wait for itself */ + + if(!block) { /* non-blocking request: wake kswapd if it is sleeping, then return */ + if (waitqueue_active(&kswapd_wait)) + wake_up_interruptible(&kswapd_wait); + return; + } + + __set_current_state(TASK_UNINTERRUPTIBLE); /* mark ourselves sleeping and queue on kswapd_done before waking kswapd */ + add_wait_queue(&kswapd_done, &wait); + + if(waitqueue_active(&kswapd_wait)) + wake_up(&kswapd_wait); + schedule(); /* sleep until kswapd_done is woken */ + + remove_wait_queue(&kswapd_done, &wait); + __set_current_state(TASK_RUNNING); } /*