From 03701f15cd3659a163ed690d8eed7ed0efef23fe Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Fri, 23 Nov 2007 15:23:00 -0500 Subject: [PATCH] Linux 2.2.19pre2 o Drop the page aging for a moment to merge the Andrea VM o Merge Andrea's VM-global patch (Andrea Arcangeli) --- Makefile | 2 +- drivers/block/rd.c | 2 +- fs/binfmt_aout.c | 4 +- fs/binfmt_elf.c | 4 +- fs/buffer.c | 137 +++++++++++++++---------- fs/coda/file.c | 8 +- fs/dcache.c | 4 +- fs/open.c | 4 +- fs/read_write.c | 12 +-- include/linux/fs.h | 18 +++- include/linux/locks.h | 2 + include/linux/mm.h | 4 - include/linux/sched.h | 5 +- init/main.c | 2 - ipc/shm.c | 4 +- kernel/fork.c | 2 + mm/filemap.c | 231 +++++++----------------------------------- mm/page_alloc.c | 168 ++++++++++++++++++++---------- mm/swap_state.c | 1 + mm/vmscan.c | 55 ++++------ 20 files changed, 302 insertions(+), 367 deletions(-) diff --git a/Makefile b/Makefile index f057cc53e5b5..a751970a0f59 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 2 SUBLEVEL = 19 -EXTRAVERSION = pre1 +EXTRAVERSION = pre2 ARCH := $(shell uname -m | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ -e s/arm.*/arm/ -e s/sa110/arm/) diff --git a/drivers/block/rd.c b/drivers/block/rd.c index f1d54b24810a..e1d9f1edba38 100644 --- a/drivers/block/rd.c +++ b/drivers/block/rd.c @@ -173,7 +173,7 @@ repeat: if (CURRENT->cmd == READ) memset(CURRENT->buffer, 0, len); else - set_bit(BH_Protected, &CURRENT->bh->b_state); + mark_buffer_protected(CURRENT->bh); end_request(1); goto repeat; diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 8da1765dd480..d56d630462d0 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -62,9 +62,9 @@ static void set_brk(unsigned long start, unsigned long end) static int dump_write(struct file *file, const void *addr, int nr) { int r; - down(&file->f_dentry->d_inode->i_sem); + fs_down(&file->f_dentry->d_inode->i_sem); r = file->f_op->write(file, addr, nr, &file->f_pos) == nr; - up(&file->f_dentry->d_inode->i_sem); + 
fs_up(&file->f_dentry->d_inode->i_sem); return r; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 5d5b91b8076c..84e9ac54c9f9 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -948,9 +948,9 @@ static int load_elf_library(int fd) static int dump_write(struct file *file, const void *addr, int nr) { int r; - down(&file->f_dentry->d_inode->i_sem); + fs_down(&file->f_dentry->d_inode->i_sem); r = file->f_op->write(file, addr, nr, &file->f_pos) == nr; - up(&file->f_dentry->d_inode->i_sem); + fs_up(&file->f_dentry->d_inode->i_sem); return r; } diff --git a/fs/buffer.c b/fs/buffer.c index b59b5b4bb740..3e27c36072f0 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -83,6 +83,7 @@ static struct wait_queue * buffer_wait = NULL; static int nr_buffers = 0; static int nr_buffers_type[NR_LIST] = {0,}; +static unsigned long size_buffers_type[NR_LIST]; static int nr_buffer_heads = 0; static int nr_unused_buffer_heads = 0; static int nr_hashed_buffers = 0; @@ -359,9 +360,9 @@ asmlinkage int sys_fsync(unsigned int fd) goto out_putf; /* We need to protect against concurrent writers.. 
*/ - down(&inode->i_sem); + fs_down(&inode->i_sem); err = file->f_op->fsync(file, dentry); - up(&inode->i_sem); + fs_up(&inode->i_sem); out_putf: fput(file); @@ -396,9 +397,9 @@ asmlinkage int sys_fdatasync(unsigned int fd) goto out_putf; /* this needs further work, at the moment it is identical to fsync() */ - down(&inode->i_sem); + fs_down(&inode->i_sem); err = file->f_op->fsync(file, dentry); - up(&inode->i_sem); + fs_up(&inode->i_sem); out_putf: fput(file); @@ -474,6 +475,7 @@ static void remove_from_queues(struct buffer_head * bh) return; } nr_buffers_type[bh->b_list]--; + size_buffers_type[bh->b_list] -= bh->b_size; remove_from_hash_queue(bh); remove_from_lru_list(bh); } @@ -523,6 +525,7 @@ static void insert_into_queues(struct buffer_head * bh) (*bhp)->b_prev_free = bh; nr_buffers_type[bh->b_list]++; + size_buffers_type[bh->b_list] += bh->b_size; /* Put the buffer in new hash-queue if it has a device. */ bh->b_next = NULL; @@ -571,8 +574,10 @@ struct buffer_head * get_hash_table(kdev_t dev, int block, int size) { struct buffer_head * bh; bh = find_buffer(dev,block,size); - if (bh) + if (bh) { bh->b_count++; + touch_buffer(bh); + } return bh; } @@ -816,6 +821,46 @@ static inline void file_buffer(struct buffer_head *bh, int list) insert_into_queues(bh); } +/* -1 -> no need to flush + 0 -> async flush + 1 -> sync flush (wait for I/O completion) */ +static int balance_dirty_state(kdev_t dev) +{ + unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit; + + dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT; + tot = (buffermem >> PAGE_SHIFT) + nr_free_pages; + tot -= size_buffers_type[BUF_PROTECTED] >> PAGE_SHIFT; + + dirty *= 200; + soft_dirty_limit = tot * bdf_prm.b_un.nfract; + hard_dirty_limit = soft_dirty_limit * 2; + + if (dirty > soft_dirty_limit) + { + if (dirty > hard_dirty_limit) + return 1; + return 0; + } + return -1; +} + +/* + * if a new dirty buffer is created we need to balance bdflush.
+ * + * in the future we might want to make bdflush aware of different + * pressures on different devices - thus the (currently unused) + * 'dev' parameter. + */ +void balance_dirty(kdev_t dev) +{ + int state = balance_dirty_state(dev); + + if (state < 0) + return; + wakeup_bdflush(state); +} + /* * A buffer may need to be moved from one buffer list to another * (e.g. in case it is not shared any more). Handle this. @@ -828,7 +873,9 @@ void refile_buffer(struct buffer_head * buf) printk("Attempt to refile free buffer\n"); return; } - if (buffer_dirty(buf)) + if (buffer_protected(buf)) + dispose = BUF_PROTECTED; + else if (buffer_dirty(buf)) dispose = BUF_DIRTY; else if (buffer_locked(buf)) dispose = BUF_LOCKED; @@ -837,13 +884,7 @@ void refile_buffer(struct buffer_head * buf) if(dispose != buf->b_list) { file_buffer(buf, dispose); if(dispose == BUF_DIRTY) { - int too_many = (nr_buffers * bdf_prm.b_un.nfract/100); - - /* This buffer is dirty, maybe we need to start flushing. - * If too high a percentage of the buffers are dirty... - */ - if (nr_buffers_type[BUF_DIRTY] > too_many) - wakeup_bdflush(1); + balance_dirty(buf->b_dev); /* If this is a loop device, and * more than half of the buffers are dirty... @@ -864,7 +905,6 @@ void __brelse(struct buffer_head * buf) /* If dirty, mark the time this buffer should be written back. 
*/ set_writetime(buf, 0); refile_buffer(buf); - touch_buffer(buf); if (buf->b_count) { buf->b_count--; @@ -1457,6 +1497,7 @@ static int grow_buffers(int size) } tmp->b_this_page = bh; free_list[isize] = bh; + mem_map[MAP_NR(page)].flags = 0; mem_map[MAP_NR(page)].buffers = bh; buffermem += PAGE_SIZE; return 1; @@ -1468,33 +1509,34 @@ static int grow_buffers(int size) #define BUFFER_BUSY_BITS ((1<b_count || ((bh)->b_state & BUFFER_BUSY_BITS)) -static int sync_page_buffers(struct page * page, int wait) +static void sync_page_buffers(struct page * page) { - struct buffer_head * bh = page->buffers; - struct buffer_head * tmp = bh; + struct buffer_head * tmp, * bh = page->buffers; + /* + * Here we'll probably sleep and so we must make sure that + * the page doesn't go away from under us. We also prefer any + * concurrent try_to_free_buffers() not to work in any way on + * our current page from under us since we're just working on it. + * As always in 2.2.x we're serialized by the big kernel lock + * during those hacky page-visibility manipulations. + * + * SUBTLE NOTE: for things like LVM snapshotting WRITEA will block too! + */ page->buffers = NULL; + tmp = bh; do { struct buffer_head *p = tmp; tmp = tmp->b_this_page; - if (buffer_locked(p)) { - if (wait) - __wait_on_buffer(p); - } else if (buffer_dirty(p)) - ll_rw_block(WRITE, 1, &p); - } while (tmp != bh); - page->buffers = bh; - - do { - struct buffer_head *p = tmp; - tmp = tmp->b_this_page; - if (buffer_busy(p)) - return 1; + if (buffer_dirty(p)) + if (test_and_set_bit(BH_Wait_IO, &p->b_state)) + ll_rw_block(WRITE, 1, &p); } while (tmp != bh); - return 0; + /* Restore the visibility of the page before returning. */ + page->buffers = bh; } /* @@ -1504,10 +1546,9 @@ static int sync_page_buffers(struct page * page, int wait) * Wake up bdflush() if this fails - if we're running low on memory due * to dirty buffers, we need to flush them out as quickly as possible. 
*/ -int try_to_free_buffers(struct page * page_map, int wait) +int try_to_free_buffers(struct page * page_map, int gfp_mask) { struct buffer_head * tmp, * bh = page_map->buffers; - int too_many; tmp = bh; do { @@ -1516,8 +1557,6 @@ int try_to_free_buffers(struct page * page_map, int wait) tmp = tmp->b_this_page; } while (tmp != bh); - succeed: - tmp = bh; do { struct buffer_head * p = tmp; tmp = tmp->b_this_page; @@ -1536,25 +1575,12 @@ int try_to_free_buffers(struct page * page_map, int wait) return 1; busy: - too_many = (nr_buffers * bdf_prm.b_un.nfract/100); + if (gfp_mask & __GFP_IO) + sync_page_buffers(page_map); - if (!sync_page_buffers(page_map, wait)) { - - /* If a high percentage of the buffers are dirty, - * wake kflushd - */ - if (nr_buffers_type[BUF_DIRTY] > too_many) - wakeup_bdflush(0); - - /* - * We can jump after the busy check because - * we rely on the kernel lock. - */ - goto succeed; - } - - if(nr_buffers_type[BUF_DIRTY] > too_many) + if (balance_dirty_state(NODEV) >= 0) wakeup_bdflush(0); + return 0; } @@ -1566,7 +1592,7 @@ void show_buffers(void) int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0; int protected = 0; int nlist; - static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY"}; + static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY","PROTECTED",}; printk("Buffer memory: %8ldkB\n",buffermem>>10); printk("Buffer heads: %6d\n",nr_buffer_heads); @@ -1590,7 +1616,7 @@ void show_buffers(void) used++, lastused = found; bh = bh->b_next_free; } while (bh != lru_list[nlist]); - printk("%8s: %d buffers, %d used (last=%d), " + printk("%9s: %d buffers, %d used (last=%d), " "%d locked, %d protected, %d dirty\n", buf_types[nlist], found, used, lastused, locked, protected, dirty); @@ -1935,7 +1961,8 @@ int bdflush(void * unused) /* If there are still a lot of dirty buffers around, skip the sleep and flush some more */ - if(ndirty == 0 || nr_buffers_type[BUF_DIRTY] <= nr_buffers * bdf_prm.b_un.nfract/100) { + if (!ndirty || 
balance_dirty_state(NODEV) < 0) + { spin_lock_irq(&current->sigmask_lock); flush_signals(current); spin_unlock_irq(&current->sigmask_lock); diff --git a/fs/coda/file.c b/fs/coda/file.c index 46303344615d..35967edfc9c9 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -190,10 +190,10 @@ static ssize_t coda_file_write(struct file *coda_file, const char *buff, return -1; } - down(&cont_inode->i_sem); + fs_down(&cont_inode->i_sem); result = cont_file.f_op->write(&cont_file , buff, count, &(cont_file.f_pos)); - up(&cont_inode->i_sem); + fs_up(&cont_inode->i_sem); coda_restore_codafile(coda_inode, coda_file, cont_inode, &cont_file); if (result) @@ -228,14 +228,14 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry) coda_prepare_openfile(coda_inode, coda_file, cont_inode, &cont_file, &cont_dentry); - down(&cont_inode->i_sem); + fs_down(&cont_inode->i_sem); result = file_fsync(&cont_file ,&cont_dentry); if ( result == 0 ) { result = venus_fsync(coda_inode->i_sb, &(cnp->c_fid)); } - up(&cont_inode->i_sem); + fs_up(&cont_inode->i_sem); coda_restore_codafile(coda_inode, coda_file, cont_inode, &cont_file); return result; diff --git a/fs/dcache.c b/fs/dcache.c index 0430bb0fdb84..e4265a5ce053 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -475,9 +475,9 @@ void shrink_dcache_parent(struct dentry * parent) */ void shrink_dcache_memory(int priority, unsigned int gfp_mask) { - if (gfp_mask & __GFP_IO) { + if (gfp_mask & __GFP_IO && !current->fs_locks) { int count = 0; - if (priority) + if (priority > 1) count = dentry_stat.nr_unused / priority; prune_dcache(count, -1); } diff --git a/fs/open.c b/fs/open.c index 7a9fa444e48f..9f9354e97196 100644 --- a/fs/open.c +++ b/fs/open.c @@ -73,7 +73,7 @@ int do_truncate(struct dentry *dentry, unsigned long length) if ((off_t) length < 0) return -EINVAL; - down(&inode->i_sem); + fs_down(&inode->i_sem); newattrs.ia_size = length; newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; error = notify_change(dentry, &newattrs); @@ -83,7 +83,7 @@ int
do_truncate(struct dentry *dentry, unsigned long length) if (inode->i_op && inode->i_op->truncate) inode->i_op->truncate(inode); } - up(&inode->i_sem); + fs_up(&inode->i_sem); return error; } diff --git a/fs/read_write.c b/fs/read_write.c index e2b5b789977b..56c4fa41125f 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -166,9 +166,9 @@ asmlinkage ssize_t sys_write(unsigned int fd, const char * buf, size_t count) if (!file->f_op || !(write = file->f_op->write)) goto out; - down(&inode->i_sem); + fs_down(&inode->i_sem); ret = write(file, buf, count, &file->f_pos); - up(&inode->i_sem); + fs_up(&inode->i_sem); out: fput(file); bad_file: @@ -314,9 +314,9 @@ asmlinkage ssize_t sys_writev(unsigned long fd, const struct iovec * vector, if (!file) goto bad_file; if (file->f_op && file->f_op->write && (file->f_mode & FMODE_WRITE)) { - down(&file->f_dentry->d_inode->i_sem); + fs_down(&file->f_dentry->d_inode->i_sem); ret = do_readv_writev(VERIFY_READ, file, vector, count); - up(&file->f_dentry->d_inode->i_sem); + fs_up(&file->f_dentry->d_inode->i_sem); } fput(file); @@ -386,9 +386,9 @@ asmlinkage ssize_t sys_pwrite(unsigned int fd, const char * buf, if (pos < 0) goto out; - down(&file->f_dentry->d_inode->i_sem); + fs_down(&file->f_dentry->d_inode->i_sem); ret = write(file, buf, count, &pos); - up(&file->f_dentry->d_inode->i_sem); + fs_up(&file->f_dentry->d_inode->i_sem); out: fput(file); diff --git a/include/linux/fs.h b/include/linux/fs.h index e2d39ed8182d..b7a722c93cdc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -190,6 +190,7 @@ typedef char buffer_block[BLOCK_SIZE]; #define BH_Lock 2 /* 1 if the buffer is locked */ #define BH_Req 3 /* 0 if the buffer has been invalidated */ #define BH_Protected 6 /* 1 if the buffer is protected */ +#define BH_Wait_IO 7 /* 1 if we should throttle on this buffer */ /* * Try to keep the most commonly used fields in single cache lines (16 @@ -782,7 +783,7 @@ extern struct file *inuse_filps; extern void 
refile_buffer(struct buffer_head * buf); extern void set_writetime(struct buffer_head * buf, int flag); -extern int try_to_free_buffers(struct page *, int wait); +extern int try_to_free_buffers(struct page *, int); extern int nr_buffers; extern long buffermem; @@ -791,15 +792,25 @@ extern int nr_buffer_heads; #define BUF_CLEAN 0 #define BUF_LOCKED 1 /* Buffers scheduled for write */ #define BUF_DIRTY 2 /* Dirty buffers, not yet scheduled for write */ -#define NR_LIST 3 +#define BUF_PROTECTED 3 /* Ramdisk persistent storage */ +#define NR_LIST 4 void mark_buffer_uptodate(struct buffer_head * bh, int on); +extern inline void mark_buffer_protected(struct buffer_head * bh) +{ + if (!test_and_set_bit(BH_Protected, &bh->b_state)) { + if (bh->b_list != BUF_PROTECTED) + refile_buffer(bh); + } +} + extern inline void mark_buffer_clean(struct buffer_head * bh) { if (test_and_clear_bit(BH_Dirty, &bh->b_state)) { if (bh->b_list == BUF_DIRTY) refile_buffer(bh); + clear_bit(BH_Wait_IO, &bh->b_state); } } @@ -941,6 +952,9 @@ extern void inode_setattr(struct inode *, struct iattr *); extern __u32 inode_generation_count; +#define fs_down(sem) do { current->fs_locks++; down(sem); } while (0) +#define fs_up(sem) do { up(sem); current->fs_locks--; } while (0) + #endif /* __KERNEL__ */ #endif diff --git a/include/linux/locks.h b/include/linux/locks.h index 2094a4d19f79..f92fa3788c31 100644 --- a/include/linux/locks.h +++ b/include/linux/locks.h @@ -50,10 +50,12 @@ extern inline void lock_super(struct super_block * sb) if (sb->s_lock) __wait_on_super(sb); sb->s_lock = 1; + current->fs_locks++; } extern inline void unlock_super(struct super_block * sb) { + current->fs_locks--; sb->s_lock = 0; wake_up(&sb->s_wait); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 84c587aca99c..ad89e46aa2f0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -129,12 +129,8 @@ typedef struct page { struct wait_queue *wait; struct page **pprev_hash; struct buffer_head * buffers; - int 
age; } mem_map_t; -#define PAGE_AGE_INITIAL 1 /* age for pages just mapped */ -#define PAGE_AGE_YOUNG 2 /* age for pages recently referenced */ - /* Page flag bit values */ #define PG_locked 0 #define PG_error 1 diff --git a/include/linux/sched.h b/include/linux/sched.h index 82b10f6ec1d6..fdecb4207d5c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -319,6 +319,8 @@ struct task_struct { struct files_struct *files; /* memory management info */ struct mm_struct *mm; + struct list_head local_pages; int allocation_order, nr_local_pages; + int fs_locks; /* signal handlers */ spinlock_t sigmask_lock; /* Protects signal and blocked */ @@ -351,6 +353,7 @@ struct task_struct { #define PF_SIGNALED 0x00000400 /* killed by a signal */ #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_VFORK 0x00001000 /* Wake up parent in mm_release */ +#define PF_FREE_PAGES 0x00002000 /* The current-> */ #define PF_USEDFPU 0x00100000 /* task used FPU this quantum (SMP) */ #define PF_DTRACE 0x00200000 /* delayed trace (used on m68k, i386) */ @@ -400,7 +403,7 @@ struct task_struct { /* tss */ INIT_TSS, \ /* fs */ &init_fs, \ /* files */ &init_files, \ -/* mm */ &init_mm, \ +/* mm */ &init_mm, { &init_task.local_pages, &init_task.local_pages}, 0, 0, 0, \ /* signals */ SPIN_LOCK_UNLOCKED, &init_signals, {{0}}, {{0}}, NULL, &init_task.sigqueue, 0, 0, \ /* exec cts */ 0,0, \ /* oom */ 0, \ diff --git a/init/main.c b/init/main.c index 5a7fe3c8b7bd..2f4f06e2b733 100644 --- a/init/main.c +++ b/init/main.c @@ -80,7 +80,6 @@ static int init(void *); extern int bdflush(void *); extern int kupdate(void *); extern int kswapd(void *); -extern int kpiod(void *); extern void kswapd_setup(void); extern unsigned long init_IRQ( unsigned long); extern void init_modules(void); @@ -1584,7 +1583,6 @@ static void __init do_basic_setup(void) kernel_thread(kupdate, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND); /* Start the background pageout daemon. 
*/ kswapd_setup(); - kernel_thread(kpiod, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND); kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND); #if CONFIG_AP1000 diff --git a/ipc/shm.c b/ipc/shm.c index 62dd3c645994..4a33fa8fe850 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -679,7 +679,7 @@ done: /* pte_val(pte) == shp->shm_pages[idx] */ } /* - * Goes through counter = (shm_rss >> prio) present shm pages. + * Goes through counter = (shm_rss / prio) present shm pages. */ static unsigned long swap_id = 0; /* currently being swapped */ static unsigned long swap_idx = 0; /* next to swap */ @@ -693,7 +693,7 @@ int shm_swap (int prio, int gfp_mask) int loop = 0; int counter; - counter = shm_rss >> prio; + counter = shm_rss / prio; if (!counter || !(swap_nr = get_swap_page())) return 0; diff --git a/kernel/fork.c b/kernel/fork.c index 70309a73c7cf..70a98bb198c6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -665,6 +665,8 @@ int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs) p->lock_depth = -1; /* -1 = no lock */ p->start_time = jiffies; + INIT_LIST_HEAD(&p->local_pages); + retval = -ENOMEM; /* copy all the process information */ if (copy_files(clone_flags, p)) diff --git a/mm/filemap.c b/mm/filemap.c index 384bab05a42e..785135056c64 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include @@ -36,25 +35,6 @@ unsigned long page_cache_size = 0; unsigned int page_hash_bits, page_hash_mask; struct page **page_hash_table; -/* - * Define a request structure for outstanding page write requests - * to the background page io daemon - */ - -struct pio_request -{ - struct pio_request * next; - struct file * file; - unsigned long offset; - unsigned long page; -}; -static struct pio_request *pio_first = NULL, **pio_last = &pio_first; -static kmem_cache_t *pio_request_cache; -static struct wait_queue *pio_wait = NULL; - -static inline void -make_pio_request(struct file *, unsigned 
long, unsigned long); - static inline int sync_page(struct page *page) { struct inode *inode = page->inode; @@ -150,14 +130,21 @@ int shrink_mmap(int priority, int gfp_mask) unsigned long limit = num_physpages; struct page * page; int count; - int nr_dirty = 0; - + /* Make sure we scan all pages twice at priority 0. */ - count = (limit << 1) >> priority; + count = limit / priority; refresh_clock: page = mem_map + clock; do { + int referenced; + + if (current->need_resched) { + current->state = TASK_RUNNING; + schedule(); + goto refresh_clock; + } + /* This works even in the presence of PageSkip because * the first two entries at the beginning of a hole will * be marked, not just the first. @@ -174,42 +161,39 @@ int shrink_mmap(int priority, int gfp_mask) clock = page - mem_map; } - if (test_and_clear_bit(PG_referenced, &page->flags)) { - page->age = PAGE_AGE_YOUNG; - continue; - } - - if (page->age > 0) { - page->age--; - continue; - } + count--; /* We can't free pages unless there's just one user */ if (atomic_read(&page->count) != 1) continue; + referenced = test_and_clear_bit(PG_referenced, &page->flags); + if (PageLocked(page)) continue; - if ((gfp_mask & __GFP_DMA) && !PageDMA(page)) + if ((gfp_mask & __GFP_DMA) && !PageDMA(page)) { + count++; continue; + } - /* Is it a page swap page? Drop it, its old. */ + /* + * Is it a page swap page? If so, we want to + * drop it if it is no longer used, even if it + * were to be marked referenced.. + */ if (PageSwapCache(page)) { + if (referenced && swap_count(page->offset) != 1) + continue; delete_from_swap_cache(page); return 1; } + if (referenced) + continue; + /* Is it a buffer page? */ if (page->buffers) { - /* - * Wait for async IO to complete - * at each 64 buffers - */ - - int wait = ((gfp_mask & __GFP_IO) - && (!(nr_dirty++ % 64))); - if (buffer_under_min()) continue; /* @@ -217,10 +201,8 @@ int shrink_mmap(int priority, int gfp_mask) * throttling. 
*/ - if (!try_to_free_buffers(page, wait)) { - if(--count < 0) break; + if (!try_to_free_buffers(page, gfp_mask)) goto refresh_clock; - } return 1; } @@ -231,8 +213,7 @@ int shrink_mmap(int priority, int gfp_mask) remove_inode_page(page); return 1; } - - } while (--count > 0); + } while (count > 0); return 0; } @@ -299,7 +280,7 @@ static inline void add_to_page_cache(struct page * page, struct page **hash) { atomic_inc(&page->count); - page->flags = (page->flags & ~((1 << PG_uptodate) | (1 << PG_error))) | (1 << PG_referenced); + page->flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced)); page->offset = offset; add_page_to_inode_queue(inode, page); __add_page_to_hash_queue(page, hash); @@ -878,12 +859,12 @@ static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned if (size > count) size = count; - down(&inode->i_sem); + fs_down(&inode->i_sem); old_fs = get_fs(); set_fs(KERNEL_DS); written = file->f_op->write(file, area, size, &file->f_pos); set_fs(old_fs); - up(&inode->i_sem); + fs_up(&inode->i_sem); if (written < 0) { desc->error = written; written = 0; @@ -1160,8 +1141,7 @@ static inline int do_write_page(struct inode * inode, struct file * file, static int filemap_write_page(struct vm_area_struct * vma, unsigned long offset, - unsigned long page, - int wait) + unsigned long page) { int result; struct file * file; @@ -1179,20 +1159,9 @@ static int filemap_write_page(struct vm_area_struct * vma, * and file could be released ... increment the count to be safe. */ file->f_count++; - - /* - * If this is a swapping operation rather than msync(), then - * leave the actual IO, and the restoration of the file count, - * to the kpiod thread. Just queue the request for now. 
- */ - if (!wait) { - make_pio_request(file, offset, page); - return 0; - } - - down(&inode->i_sem); + fs_down(&inode->i_sem); result = do_write_page(inode, file, (const char *) page, offset); - up(&inode->i_sem); + fs_up(&inode->i_sem); fput(file); return result; } @@ -1205,7 +1174,7 @@ static int filemap_write_page(struct vm_area_struct * vma, */ int filemap_swapout(struct vm_area_struct * vma, struct page * page) { - return filemap_write_page(vma, page->offset, page_address(page), 0); + return filemap_write_page(vma, page->offset, page_address(page)); } static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma, @@ -1242,7 +1211,7 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma, return 0; } } - error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page, 1); + error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page); page_cache_free(page); return error; } @@ -1414,9 +1383,9 @@ static int msync_interval(struct vm_area_struct * vma, if (file) { struct dentry * dentry = file->f_dentry; struct inode * inode = dentry->d_inode; - down(&inode->i_sem); + fs_down(&inode->i_sem); error = file_fsync(file, dentry); - up(&inode->i_sem); + fs_up(&inode->i_sem); } } return error; @@ -1745,130 +1714,6 @@ void put_cached_page(unsigned long addr) page_cache_release(page); } - -/* Add request for page IO to the queue */ - -static inline void put_pio_request(struct pio_request *p) -{ - *pio_last = p; - p->next = NULL; - pio_last = &p->next; -} - -/* Take the first page IO request off the queue */ - -static inline struct pio_request * get_pio_request(void) -{ - struct pio_request * p = pio_first; - pio_first = p->next; - if (!pio_first) - pio_last = &pio_first; - return p; -} - -/* Make a new page IO request and queue it to the kpiod thread */ - -static inline void make_pio_request(struct file *file, - unsigned long offset, - unsigned long page) -{ - struct pio_request *p; - - 
atomic_inc(&page_cache_entry(page)->count); - - /* - * We need to allocate without causing any recursive IO in the - * current thread's context. We might currently be swapping out - * as a result of an allocation made while holding a critical - * filesystem lock. To avoid deadlock, we *MUST* not reenter - * the filesystem in this thread. - * - * We can wait for kswapd to free memory, or we can try to free - * pages without actually performing further IO, without fear of - * deadlock. --sct - */ - - while ((p = kmem_cache_alloc(pio_request_cache, GFP_BUFFER)) == NULL) { - if (try_to_free_pages(__GFP_WAIT)) - continue; - current->state = TASK_INTERRUPTIBLE; - schedule_timeout(HZ/10); - } - - p->file = file; - p->offset = offset; - p->page = page; - - put_pio_request(p); - wake_up(&pio_wait); -} - - -/* - * This is the only thread which is allowed to write out filemap pages - * while swapping. - * - * To avoid deadlock, it is important that we never reenter this thread. - * Although recursive memory allocations within this thread may result - * in more page swapping, that swapping will always be done by queuing - * another IO request to the same thread: we will never actually start - * that IO request until we have finished with the current one, and so - * we will not deadlock. - */ - -int kpiod(void * unused) -{ - struct task_struct *tsk = current; - struct wait_queue wait = { tsk, }; - struct inode * inode; - struct dentry * dentry; - struct pio_request * p; - - tsk->session = 1; - tsk->pgrp = 1; - strcpy(tsk->comm, "kpiod"); - sigfillset(&tsk->blocked); - init_waitqueue(&pio_wait); - /* - * Mark this task as a memory allocator - we don't want to get caught - * up in the regular mm freeing frenzy if we have to allocate memory - * in order to write stuff out. 
- */ - tsk->flags |= PF_MEMALLOC; - - lock_kernel(); - - pio_request_cache = kmem_cache_create("pio_request", - sizeof(struct pio_request), - 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); - if (!pio_request_cache) - panic ("Could not create pio_request slab cache"); - - while (1) { - tsk->state = TASK_INTERRUPTIBLE; - add_wait_queue(&pio_wait, &wait); - if (!pio_first) - schedule(); - remove_wait_queue(&pio_wait, &wait); - tsk->state = TASK_RUNNING; - - while (pio_first) { - p = get_pio_request(); - dentry = p->file->f_dentry; - inode = dentry->d_inode; - - down(&inode->i_sem); - do_write_page(inode, p->file, - (const char *) p->page, p->offset); - up(&inode->i_sem); - fput(p->file); - page_cache_free(p->page); - kmem_cache_free(pio_request_cache, p); - } - } -} - void __init page_cache_init(unsigned long memory_size) { unsigned long htable_size; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8212c29bb780..533cca3ad461 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -93,34 +93,69 @@ static inline void remove_mem_queue(struct page * entry) */ spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED; +#define list(x) (mem_map+(x)) +#define __free_pages_ok(map_nr, mask, area, index) \ + nr_free_pages -= (mask); \ + while ((mask) + (1 << (NR_MEM_LISTS-1))) { \ + if (!test_and_change_bit((index), (area)->map)) \ + break; \ + (area)->count--; \ + remove_mem_queue(list((map_nr) ^ -(mask))); \ + (mask) <<= 1; \ + (area)++; \ + (index) >>= 1; \ + (map_nr) &= (mask); \ + } \ + add_mem_queue(area, list(map_nr)); + +static void free_local_pages(struct page * page) { + unsigned long order = page->offset; + unsigned int type = PageDMA(page) ? 
1 : 0; + struct free_area_struct *area; + unsigned long map_nr = page - mem_map; + unsigned long mask = (~0UL) << order; + unsigned long index = map_nr >> (1 + order); + + area = free_area[type] + order; + __free_pages_ok(map_nr, mask, area, index); +} + static inline void free_pages_ok(unsigned long map_nr, unsigned long order, unsigned type) { - struct free_area_struct *area = free_area[type] + order; - unsigned long index = map_nr >> (1 + order); - unsigned long mask = (~0UL) << order; + struct free_area_struct *area; + unsigned long index; + unsigned long mask; unsigned long flags; + struct page * page; - spin_lock_irqsave(&page_alloc_lock, flags); - -#define list(x) (mem_map+(x)) + if (current->flags & PF_FREE_PAGES) + goto local_freelist; + back_local_freelist: + index = map_nr >> (1 + order); + mask = (~0UL) << order; map_nr &= mask; - nr_free_pages -= mask; - while (mask + (1 << (NR_MEM_LISTS-1))) { - if (!test_and_change_bit(index, area->map)) - break; - area->count--; - remove_mem_queue(list(map_nr ^ -mask)); - mask <<= 1; - area++; - index >>= 1; - map_nr &= mask; - } - add_mem_queue(area, list(map_nr)); - -#undef list + spin_lock_irqsave(&page_alloc_lock, flags); + area = free_area[type] + order; + __free_pages_ok(map_nr, mask, area, index); spin_unlock_irqrestore(&page_alloc_lock, flags); + return; + + local_freelist: + /* + * This is a little subtle: if the allocation order + * wanted is greater than zero we'd better take all the pages + * local since we must deal with fragmentation too and we + * can't rely on the nr_local_pages information.
+ */ + if (current->nr_local_pages && !current->allocation_order) + goto back_local_freelist; + + page = mem_map + map_nr; + list_add((struct list_head *) page, &current->local_pages); + page->offset = order; + current->nr_local_pages++; } void __free_pages(struct page *page, unsigned long order) @@ -129,7 +164,6 @@ void __free_pages(struct page *page, unsigned long order) if (PageSwapCache(page)) panic ("Freeing swap cache page"); page->flags &= ~(1 << PG_referenced); - page->age = PAGE_AGE_INITIAL; free_pages_ok(page - mem_map, order, PageDMA(page) ? 1 : 0); return; } @@ -180,13 +214,32 @@ do { unsigned long size = 1 << high; \ atomic_set(&map->count, 1); \ } while (0) +static void refile_local_pages(void) +{ + if (current->nr_local_pages) { + struct page * page; + struct list_head * entry; + int nr_pages = current->nr_local_pages; + + while ((entry = current->local_pages.next) != &current->local_pages) { + list_del(entry); + page = (struct page *) entry; + free_local_pages(page); + if (!nr_pages--) + panic("__get_free_pages local_pages list corrupted I"); + } + if (nr_pages) + panic("__get_free_pages local_pages list corrupted II"); + current->nr_local_pages = 0; + } +} + unsigned long __get_free_pages(int gfp_mask, unsigned long order) { unsigned long flags; - static atomic_t free_before_allocate = ATOMIC_INIT(0); if (order >= NR_MEM_LISTS) - goto nopage; + goto out; #ifdef ATOMIC_MEMORY_DEBUGGING if ((gfp_mask & __GFP_WAIT) && in_interrupt()) { @@ -195,26 +248,24 @@ unsigned long __get_free_pages(int gfp_mask, unsigned long order) printk("gfp called nonatomically from interrupt %p\n", __builtin_return_address(0)); } - goto nopage; + goto out; } #endif + /* + * Acquire lock before reading nr_free_pages to make sure it + * won't change from under us. + */ + spin_lock_irqsave(&page_alloc_lock, flags); + /* * If this is a recursive call, we'd better * do our best to just allocate things without * further thought.
*/ if (!(current->flags & PF_MEMALLOC)) { - int freed; extern struct wait_queue * kswapd_wait; - /* Somebody needs to free pages so we free some of our own. */ - if (atomic_read(&free_before_allocate)) { - current->flags |= PF_MEMALLOC; - try_to_free_pages(gfp_mask); - current->flags &= ~PF_MEMALLOC; - } - if (nr_free_pages > freepages.low) goto ok_to_allocate; @@ -224,35 +275,44 @@ unsigned long __get_free_pages(int gfp_mask, unsigned long order) /* Do we have to block or can we proceed? */ if (nr_free_pages > freepages.min) goto ok_to_allocate; - - current->flags |= PF_MEMALLOC; - atomic_inc(&free_before_allocate); - freed = try_to_free_pages(gfp_mask); - atomic_dec(&free_before_allocate); - current->flags &= ~PF_MEMALLOC; - - /* - * Re-check we're still low on memory after we blocked - * for some time. Somebody may have released lots of - * memory from under us while we was trying to free - * the pages. We check against pages_high to be sure - * to succeed only if lots of memory is been released. - */ - if (nr_free_pages > freepages.high) - goto ok_to_allocate; - - if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH))) - goto nopage; + if (gfp_mask & __GFP_WAIT) { + int freed; + /* + * If the task is ok to sleep it's fine also + * if we release irq here. + */ + spin_unlock_irq(&page_alloc_lock); + + current->flags |= PF_MEMALLOC|PF_FREE_PAGES; + current->allocation_order = order; + freed = try_to_free_pages(gfp_mask); + current->flags &= ~(PF_MEMALLOC|PF_FREE_PAGES); + + spin_lock_irq(&page_alloc_lock); + refile_local_pages(); + + /* + * Re-check we're still low on memory after we blocked + * for some time. Somebody may have released lots of + * memory from under us while we was trying to free + * the pages. We check against pages_high to be sure + * to succeed only if lots of memory is been released. 
+ */ + if (nr_free_pages > freepages.high) + goto ok_to_allocate; + + if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH))) + goto nopage; + } } ok_to_allocate: - spin_lock_irqsave(&page_alloc_lock, flags); /* if it's not a dma request, try non-dma first */ if (!(gfp_mask & __GFP_DMA)) RMQUEUE_TYPE(order, 0); RMQUEUE_TYPE(order, 1); + nopage: spin_unlock_irqrestore(&page_alloc_lock, flags); - -nopage: + out: return 0; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 8c5e7176c5aa..7c27e5b338db 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -63,6 +63,7 @@ int add_to_swap_cache(struct page *page, unsigned long entry) return 0; } atomic_inc(&page->count); + page->flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced)); page->inode = &swapper_inode; page->offset = entry; add_page_to_hash_queue(page, &swapper_inode, entry); diff --git a/mm/vmscan.c b/mm/vmscan.c index 86e6b1fe9a87..81ba9a55ba62 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -96,6 +96,9 @@ drop_pte: * some real work in the future in "shrink_mmap()". */ if (!pte_dirty(pte)) { + if (page_map->inode && pgcache_under_min()) + /* unmapping this page would be useless */ + return 0; flush_cache_page(vma, address); pte_clear(page_table); goto drop_pte; @@ -106,7 +109,7 @@ drop_pte: * we cannot do I/O! Avoid recursing on FS * locks etc. */ - if (!(gfp_mask & __GFP_IO)) + if (!(gfp_mask & __GFP_IO) || current->fs_locks) return 0; /* @@ -208,6 +211,8 @@ static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * result = try_to_swap_out(tsk, vma, address, pte, gfp_mask); if (result) return result; + if (current->need_resched) + return 2; address += PAGE_SIZE; pte++; } while (address < end); @@ -327,7 +332,7 @@ static int swap_out(unsigned int priority, int gfp_mask) * Think of swap_cnt as a "shadow rss" - it tells us which process * we want to page out (always try largest first). 
*/ - counter = nr_tasks / (priority+1); + counter = nr_tasks / priority; if (counter < 1) counter = 1; @@ -361,8 +366,13 @@ static int swap_out(unsigned int priority, int gfp_mask) goto out; } - if (swap_out_process(pbest, gfp_mask)) + switch (swap_out_process(pbest, gfp_mask)) { + case 1: return 1; + case 2: + current->state = TASK_RUNNING; + schedule(); + } } out: return 0; @@ -377,11 +387,9 @@ out: * cluster them so that we get good swap-out behaviour. See * the "free_memory()" macro for details. */ -static int do_try_to_free_pages(unsigned int gfp_mask) +int try_to_free_pages(unsigned int gfp_mask) { int priority; - int ret = 0; - int swapcount; int count = SWAP_CLUSTER_MAX; lock_kernel(); @@ -389,41 +397,34 @@ static int do_try_to_free_pages(unsigned int gfp_mask) /* Always trim SLAB caches when memory gets low. */ kmem_cache_reap(gfp_mask); - priority = 6; + priority = 5; do { while (shrink_mmap(priority, gfp_mask)) { - ret = 1; if (!--count) goto done; } /* Try to get rid of some shared memory pages.. */ - if (gfp_mask & __GFP_IO) { + if (gfp_mask & __GFP_IO && !current->fs_locks) { while (shm_swap(priority, gfp_mask)) { - ret = 1; if (!--count) goto done; } } /* Then, try to page stuff out.. */ - swapcount = count; while (swap_out(priority, gfp_mask)) { - ret = 1; - if (!--swapcount) - break; + if (!--count) + goto done; } shrink_dcache_memory(priority, gfp_mask); - } while (--priority >= 0); + } while (--priority > 0); done: unlock_kernel(); - if (!ret) - printk("VM: do_try_to_free_pages failed for %s...\n", - current->comm); /* Return success if we freed a page. 
*/ - return ret; + return priority > 0; } /* @@ -499,7 +500,7 @@ int kswapd(void *unused) while (nr_free_pages < freepages.high) { - if (do_try_to_free_pages(GFP_KSWAPD)) + if (try_to_free_pages(GFP_KSWAPD)) { if (tsk->need_resched) schedule(); @@ -510,17 +511,3 @@ int kswapd(void *unused) } } } - -/* - * Called by non-kswapd processes when kswapd really cannot - * keep up with the demand for free memory. - */ -int try_to_free_pages(unsigned int gfp_mask) -{ - int retval = 1; - - if (gfp_mask & __GFP_WAIT) - retval = do_try_to_free_pages(gfp_mask); - return retval; -} - -- 2.39.5