VERSION = 2
PATCHLEVEL = 2
SUBLEVEL = 19
-EXTRAVERSION = pre1
+EXTRAVERSION = pre2
ARCH := $(shell uname -m | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ -e s/arm.*/arm/ -e s/sa110/arm/)
if (CURRENT->cmd == READ)
memset(CURRENT->buffer, 0, len);
else
- set_bit(BH_Protected, &CURRENT->bh->b_state);
+ mark_buffer_protected(CURRENT->bh);
end_request(1);
goto repeat;
static int dump_write(struct file *file, const void *addr, int nr)
{
int r;
- down(&file->f_dentry->d_inode->i_sem);
+ fs_down(&file->f_dentry->d_inode->i_sem);
r = file->f_op->write(file, addr, nr, &file->f_pos) == nr;
- up(&file->f_dentry->d_inode->i_sem);
+ fs_up(&file->f_dentry->d_inode->i_sem);
return r;
}
static int dump_write(struct file *file, const void *addr, int nr)
{
int r;
- down(&file->f_dentry->d_inode->i_sem);
+ fs_down(&file->f_dentry->d_inode->i_sem);
r = file->f_op->write(file, addr, nr, &file->f_pos) == nr;
- up(&file->f_dentry->d_inode->i_sem);
+ fs_up(&file->f_dentry->d_inode->i_sem);
return r;
}
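Throughout the patch, down()/up() pairs on filesystem semaphores such as i_sem become fs_down()/fs_up(). These are wrappers added in include/linux/fs.h further down: besides taking the semaphore they keep a per-task count of held filesystem locks, which the reclaim paths consult to avoid re-entering the filesystem; see the sketch after the lock_super()/unlock_super() hunk below.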
static int nr_buffers = 0;
static int nr_buffers_type[NR_LIST] = {0,};
+static unsigned long size_buffers_type[NR_LIST];
static int nr_buffer_heads = 0;
static int nr_unused_buffer_heads = 0;
static int nr_hashed_buffers = 0;
goto out_putf;
/* We need to protect against concurrent writers.. */
- down(&inode->i_sem);
+ fs_down(&inode->i_sem);
err = file->f_op->fsync(file, dentry);
- up(&inode->i_sem);
+ fs_up(&inode->i_sem);
out_putf:
fput(file);
goto out_putf;
/* this needs further work, at the moment it is identical to fsync() */
- down(&inode->i_sem);
+ fs_down(&inode->i_sem);
err = file->f_op->fsync(file, dentry);
- up(&inode->i_sem);
+ fs_up(&inode->i_sem);
out_putf:
fput(file);
return;
}
nr_buffers_type[bh->b_list]--;
+ size_buffers_type[bh->b_list] -= bh->b_size;
remove_from_hash_queue(bh);
remove_from_lru_list(bh);
}
(*bhp)->b_prev_free = bh;
nr_buffers_type[bh->b_list]++;
+ size_buffers_type[bh->b_list] += bh->b_size;
/* Put the buffer in new hash-queue if it has a device. */
bh->b_next = NULL;
{
struct buffer_head * bh;
bh = find_buffer(dev,block,size);
- if (bh)
+ if (bh) {
bh->b_count++;
+ touch_buffer(bh);
+ }
return bh;
}
insert_into_queues(bh);
}
+/* -1 -> no need to flush
+ 0 -> async flush
+ 1 -> sync flush (wait for I/O completion) */
+static int balance_dirty_state(kdev_t dev)
+{
+ unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
+
+ dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
+ tot = (buffermem >> PAGE_SHIFT) + nr_free_pages;
+ tot -= size_buffers_type[BUF_PROTECTED] >> PAGE_SHIFT;
+
+ dirty *= 200;
+ soft_dirty_limit = tot * bdf_prm.b_un.nfract;
+ hard_dirty_limit = soft_dirty_limit * 2;
+
+ if (dirty > soft_dirty_limit)
+ {
+ if (dirty > hard_dirty_limit)
+ return 1;
+ return 0;
+ }
+ return -1;
+}
+
+/*
+ * If a new dirty buffer is created, we need to balance bdflush.
+ *
+ * In the future we might want to make bdflush aware of different
+ * pressures on different devices - thus the (currently unused)
+ * 'dev' parameter.
+ */
+void balance_dirty(kdev_t dev)
+{
+ int state = balance_dirty_state(dev);
+
+ if (state < 0)
+ return;
+ wakeup_bdflush(state);
+}
+
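A note on the arithmetic above: balance_dirty_state() never divides, it scales both sides of the comparison, so the soft limit fires once dirty/tot exceeds nfract/200 and the hard limit at exactly twice that ratio. A standalone model of the same computation (illustration only; nfract = 40 is an assumed sample value, the real one comes from bdf_prm.b_un.nfract):

#include <stdio.h>

static int model_balance_dirty_state(unsigned long dirty, unsigned long tot,
				     unsigned long nfract)
{
	unsigned long soft_dirty_limit, hard_dirty_limit;

	dirty *= 200;
	soft_dirty_limit = tot * nfract;
	hard_dirty_limit = soft_dirty_limit * 2;

	if (dirty > soft_dirty_limit) {
		if (dirty > hard_dirty_limit)
			return 1;	/* sync flush */
		return 0;		/* async flush */
	}
	return -1;			/* no need to flush */
}

int main(void)
{
	unsigned long tot = 16384;	/* e.g. 64MB of 4k pages */

	/* with nfract = 40, the soft limit is 20% dirty and the hard limit 40% */
	printf("10%% dirty -> %d\n", model_balance_dirty_state(tot / 10, tot, 40));
	printf("25%% dirty -> %d\n", model_balance_dirty_state(tot / 4, tot, 40));
	printf("50%% dirty -> %d\n", model_balance_dirty_state(tot / 2, tot, 40));
	return 0;
}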
/*
* A buffer may need to be moved from one buffer list to another
* (e.g. in case it is not shared any more). Handle this.
printk("Attempt to refile free buffer\n");
return;
}
- if (buffer_dirty(buf))
+ if (buffer_protected(buf))
+ dispose = BUF_PROTECTED;
+ else if (buffer_dirty(buf))
dispose = BUF_DIRTY;
else if (buffer_locked(buf))
dispose = BUF_LOCKED;
if(dispose != buf->b_list) {
file_buffer(buf, dispose);
if(dispose == BUF_DIRTY) {
- int too_many = (nr_buffers * bdf_prm.b_un.nfract/100);
-
- /* This buffer is dirty, maybe we need to start flushing.
- * If too high a percentage of the buffers are dirty...
- */
- if (nr_buffers_type[BUF_DIRTY] > too_many)
- wakeup_bdflush(1);
+ balance_dirty(buf->b_dev);
/* If this is a loop device, and
* more than half of the buffers are dirty...
/* If dirty, mark the time this buffer should be written back. */
set_writetime(buf, 0);
refile_buffer(buf);
- touch_buffer(buf);
if (buf->b_count) {
buf->b_count--;
}
tmp->b_this_page = bh;
free_list[isize] = bh;
+ mem_map[MAP_NR(page)].flags = 0;
mem_map[MAP_NR(page)].buffers = bh;
buffermem += PAGE_SIZE;
return 1;
#define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
#define buffer_busy(bh) ((bh)->b_count || ((bh)->b_state & BUFFER_BUSY_BITS))
-static int sync_page_buffers(struct page * page, int wait)
+static void sync_page_buffers(struct page * page)
{
- struct buffer_head * bh = page->buffers;
- struct buffer_head * tmp = bh;
+ struct buffer_head * tmp, * bh = page->buffers;
+ /*
+ * Here we'll probably sleep and so we must make sure that
+ * the page doesn't go away from under us. We also prefer that
+ * any concurrent try_to_free_buffers() leaves our current
+ * page alone while we're working on it.
+ * As always in 2.2.x we're serialized by the big kernel lock
+ * during these hacky page-visibility manipulations.
+ *
+ * SUBTLE NOTE: for things like LVM snapshotting, WRITEA will block too!
+ */
page->buffers = NULL;
+ tmp = bh;
do {
struct buffer_head *p = tmp;
tmp = tmp->b_this_page;
- if (buffer_locked(p)) {
- if (wait)
- __wait_on_buffer(p);
- } else if (buffer_dirty(p))
- ll_rw_block(WRITE, 1, &p);
- } while (tmp != bh);
- page->buffers = bh;
-
- do {
- struct buffer_head *p = tmp;
- tmp = tmp->b_this_page;
- if (buffer_busy(p))
- return 1;
+ if (buffer_dirty(p))
+ if (test_and_set_bit(BH_Wait_IO, &p->b_state))
+ ll_rw_block(WRITE, 1, &p);
} while (tmp != bh);
- return 0;
+ /* Restore the visibility of the page before returning. */
+ page->buffers = bh;
}
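The BH_Wait_IO test reads backwards at first sight: test_and_set_bit() returns the previous value of the bit, so the first pass over a dirty buffer only marks it, and the write is issued when the buffer is met dirty a second time, throttling write-out. A userspace model of the two-pass behaviour (illustration only; the kernel's test_and_set_bit() is atomic, this stand-in is not):

#include <stdio.h>

#define BH_Wait_IO 7

/* non-atomic stand-in for the kernel helper of the same name */
static int test_and_set_bit(int nr, unsigned long *addr)
{
	int old = (*addr >> nr) & 1;

	*addr |= 1UL << nr;
	return old;
}

int main(void)
{
	unsigned long b_state = 0;
	int pass;

	for (pass = 1; pass <= 2; pass++) {
		if (test_and_set_bit(BH_Wait_IO, &b_state))
			printf("pass %d: ll_rw_block() would be called\n", pass);
		else
			printf("pass %d: buffer only marked\n", pass);
	}
	return 0;
}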
/*
* Wake up bdflush() if this fails - if we're running low on memory due
* to dirty buffers, we need to flush them out as quickly as possible.
*/
-int try_to_free_buffers(struct page * page_map, int wait)
+int try_to_free_buffers(struct page * page_map, int gfp_mask)
{
struct buffer_head * tmp, * bh = page_map->buffers;
- int too_many;
tmp = bh;
do {
tmp = tmp->b_this_page;
} while (tmp != bh);
- succeed:
- tmp = bh;
do {
struct buffer_head * p = tmp;
tmp = tmp->b_this_page;
return 1;
busy:
- too_many = (nr_buffers * bdf_prm.b_un.nfract/100);
+ if (gfp_mask & __GFP_IO)
+ sync_page_buffers(page_map);
- if (!sync_page_buffers(page_map, wait)) {
-
- /* If a high percentage of the buffers are dirty,
- * wake kflushd
- */
- if (nr_buffers_type[BUF_DIRTY] > too_many)
- wakeup_bdflush(0);
-
- /*
- * We can jump after the busy check because
- * we rely on the kernel lock.
- */
- goto succeed;
- }
-
- if(nr_buffers_type[BUF_DIRTY] > too_many)
+ if (balance_dirty_state(NODEV) >= 0)
wakeup_bdflush(0);
+
return 0;
}
int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
int protected = 0;
int nlist;
- static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY"};
+ static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY","PROTECTED",};
printk("Buffer memory: %8ldkB\n",buffermem>>10);
printk("Buffer heads: %6d\n",nr_buffer_heads);
used++, lastused = found;
bh = bh->b_next_free;
} while (bh != lru_list[nlist]);
- printk("%8s: %d buffers, %d used (last=%d), "
+ printk("%9s: %d buffers, %d used (last=%d), "
"%d locked, %d protected, %d dirty\n",
buf_types[nlist], found, used, lastused,
locked, protected, dirty);
/* If there are still a lot of dirty buffers around, skip the sleep
and flush some more */
- if(ndirty == 0 || nr_buffers_type[BUF_DIRTY] <= nr_buffers * bdf_prm.b_un.nfract/100) {
+ if (!ndirty || balance_dirty_state(NODEV) < 0)
+ {
spin_lock_irq(&current->sigmask_lock);
flush_signals(current);
spin_unlock_irq(&current->sigmask_lock);
return -1;
}
- down(&cont_inode->i_sem);
+ fs_down(&cont_inode->i_sem);
result = cont_file.f_op->write(&cont_file , buff, count,
&(cont_file.f_pos));
- up(&cont_inode->i_sem);
+ fs_up(&cont_inode->i_sem);
coda_restore_codafile(coda_inode, coda_file, cont_inode, &cont_file);
if (result)
coda_prepare_openfile(coda_inode, coda_file, cont_inode,
&cont_file, &cont_dentry);
- down(&cont_inode->i_sem);
+ fs_down(&cont_inode->i_sem);
result = file_fsync(&cont_file ,&cont_dentry);
if ( result == 0 ) {
result = venus_fsync(coda_inode->i_sb, &(cnp->c_fid));
}
- up(&cont_inode->i_sem);
+ fs_up(&cont_inode->i_sem);
coda_restore_codafile(coda_inode, coda_file, cont_inode, &cont_file);
return result;
*/
void shrink_dcache_memory(int priority, unsigned int gfp_mask)
{
- if (gfp_mask & __GFP_IO) {
+ if (gfp_mask & __GFP_IO && !current->fs_locks) {
int count = 0;
- if (priority)
+ if (priority > 1)
count = dentry_stat.nr_unused / priority;
prune_dcache(count, -1);
}
if ((off_t) length < 0)
return -EINVAL;
- down(&inode->i_sem);
+ fs_down(&inode->i_sem);
newattrs.ia_size = length;
newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
error = notify_change(dentry, &newattrs);
if (inode->i_op && inode->i_op->truncate)
inode->i_op->truncate(inode);
}
- up(&inode->i_sem);
+ fs_up(&inode->i_sem);
return error;
}
if (!file->f_op || !(write = file->f_op->write))
goto out;
- down(&inode->i_sem);
+ fs_down(&inode->i_sem);
ret = write(file, buf, count, &file->f_pos);
- up(&inode->i_sem);
+ fs_up(&inode->i_sem);
out:
fput(file);
bad_file:
if (!file)
goto bad_file;
if (file->f_op && file->f_op->write && (file->f_mode & FMODE_WRITE)) {
- down(&file->f_dentry->d_inode->i_sem);
+ fs_down(&file->f_dentry->d_inode->i_sem);
ret = do_readv_writev(VERIFY_READ, file, vector, count);
- up(&file->f_dentry->d_inode->i_sem);
+ fs_up(&file->f_dentry->d_inode->i_sem);
}
fput(file);
if (pos < 0)
goto out;
- down(&file->f_dentry->d_inode->i_sem);
+ fs_down(&file->f_dentry->d_inode->i_sem);
ret = write(file, buf, count, &pos);
- up(&file->f_dentry->d_inode->i_sem);
+ fs_up(&file->f_dentry->d_inode->i_sem);
out:
fput(file);
#define BH_Lock 2 /* 1 if the buffer is locked */
#define BH_Req 3 /* 0 if the buffer has been invalidated */
#define BH_Protected 6 /* 1 if the buffer is protected */
+#define BH_Wait_IO 7 /* 1 if we should throttle on this buffer */
/*
* Try to keep the most commonly used fields in single cache lines (16
extern void refile_buffer(struct buffer_head * buf);
extern void set_writetime(struct buffer_head * buf, int flag);
-extern int try_to_free_buffers(struct page *, int wait);
+extern int try_to_free_buffers(struct page *, int);
extern int nr_buffers;
extern long buffermem;
#define BUF_CLEAN 0
#define BUF_LOCKED 1 /* Buffers scheduled for write */
#define BUF_DIRTY 2 /* Dirty buffers, not yet scheduled for write */
-#define NR_LIST 3
+#define BUF_PROTECTED 3 /* Ramdisk persistent storage */
+#define NR_LIST 4
void mark_buffer_uptodate(struct buffer_head * bh, int on);
+extern inline void mark_buffer_protected(struct buffer_head * bh)
+{
+ if (!test_and_set_bit(BH_Protected, &bh->b_state)) {
+ if (bh->b_list != BUF_PROTECTED)
+ refile_buffer(bh);
+ }
+}
+
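This inline is what the drivers/block/rd.c hunk at the top of the patch now calls in place of the open-coded set_bit(): besides setting BH_Protected it refiles the buffer onto the new BUF_PROTECTED list, so the size_buffers_type[] accounting in fs/buffer.c can subtract ramdisk-backed memory from the dirty-balancing totals.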
extern inline void mark_buffer_clean(struct buffer_head * bh)
{
if (test_and_clear_bit(BH_Dirty, &bh->b_state)) {
if (bh->b_list == BUF_DIRTY)
refile_buffer(bh);
+ clear_bit(BH_Wait_IO, &bh->b_state);
}
}
extern __u32 inode_generation_count;
+#define fs_down(sem) do { current->fs_locks++; down(sem); } while (0)
+#define fs_up(sem) do { up(sem); current->fs_locks--; } while (0)
+
#endif /* __KERNEL__ */
#endif
if (sb->s_lock)
__wait_on_super(sb);
sb->s_lock = 1;
+ current->fs_locks++;
}
extern inline void unlock_super(struct super_block * sb)
{
+ current->fs_locks--;
sb->s_lock = 0;
wake_up(&sb->s_wait);
}
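Taken together, fs_down()/fs_up() and lock_super()/unlock_super() maintain current->fs_locks as a simple recursion guard: the reclaim paths later in the patch (note the !current->fs_locks checks) refuse to start filesystem I/O while the task already holds a filesystem lock. A minimal userspace model of the idea (illustration only, names hypothetical):

#include <stdio.h>

struct task { int fs_locks; };
static struct task current_task;	/* stands in for *current */

#define fs_down(t)	do { (t)->fs_locks++; /* down(&inode->i_sem) */ } while (0)
#define fs_up(t)	do { /* up(&inode->i_sem) */ (t)->fs_locks--; } while (0)

/* mirrors the "!current->fs_locks" checks added in mm/vmscan.c */
static int may_enter_fs(const struct task *t)
{
	return !t->fs_locks;
}

int main(void)
{
	fs_down(&current_task);
	printf("reclaim may enter the fs: %d\n", may_enter_fs(&current_task)); /* 0 */
	fs_up(&current_task);
	printf("reclaim may enter the fs: %d\n", may_enter_fs(&current_task)); /* 1 */
	return 0;
}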
struct wait_queue *wait;
struct page **pprev_hash;
struct buffer_head * buffers;
- int age;
} mem_map_t;
-#define PAGE_AGE_INITIAL 1 /* age for pages just mapped */
-#define PAGE_AGE_YOUNG 2 /* age for pages recently referenced */
-
/* Page flag bit values */
#define PG_locked 0
#define PG_error 1
struct files_struct *files;
/* memory management info */
struct mm_struct *mm;
+ struct list_head local_pages;
+ int allocation_order, nr_local_pages;
+ int fs_locks;
/* signal handlers */
spinlock_t sigmask_lock; /* Protects signal and blocked */
#define PF_SIGNALED 0x00000400 /* killed by a signal */
#define PF_MEMALLOC 0x00000800 /* Allocating memory */
#define PF_VFORK 0x00001000 /* Wake up parent in mm_release */
+#define PF_FREE_PAGES 0x00002000 /* Freed pages go to current->local_pages */
#define PF_USEDFPU 0x00100000 /* task used FPU this quantum (SMP) */
#define PF_DTRACE 0x00200000 /* delayed trace (used on m68k, i386) */
/* tss */ INIT_TSS, \
/* fs */ &init_fs, \
/* files */ &init_files, \
-/* mm */ &init_mm, \
+/* mm */ &init_mm, { &init_task.local_pages, &init_task.local_pages}, 0, 0, 0, \
/* signals */ SPIN_LOCK_UNLOCKED, &init_signals, {{0}}, {{0}}, NULL, &init_task.sigqueue, 0, 0, \
/* exec cts */ 0,0, \
/* oom */ 0, \
extern int bdflush(void *);
extern int kupdate(void *);
extern int kswapd(void *);
-extern int kpiod(void *);
extern void kswapd_setup(void);
extern unsigned long init_IRQ( unsigned long);
extern void init_modules(void);
kernel_thread(kupdate, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
/* Start the background pageout daemon. */
kswapd_setup();
- kernel_thread(kpiod, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
#if CONFIG_AP1000
}
/*
- * Goes through counter = (shm_rss >> prio) present shm pages.
+ * Goes through counter = (shm_rss / prio) present shm pages.
*/
static unsigned long swap_id = 0; /* currently being swapped */
static unsigned long swap_idx = 0; /* next to swap */
int loop = 0;
int counter;
- counter = shm_rss >> prio;
+ counter = shm_rss / prio;
if (!counter || !(swap_nr = get_swap_page()))
return 0;
p->lock_depth = -1; /* -1 = no lock */
p->start_time = jiffies;
+ INIT_LIST_HEAD(&p->local_pages);
+
retval = -ENOMEM;
/* copy all the process information */
if (copy_files(clone_flags, p))
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/swapctl.h>
-#include <linux/slab.h>
#include <linux/init.h>
#include <asm/pgtable.h>
unsigned int page_hash_bits, page_hash_mask;
struct page **page_hash_table;
-/*
- * Define a request structure for outstanding page write requests
- * to the background page io daemon
- */
-
-struct pio_request
-{
- struct pio_request * next;
- struct file * file;
- unsigned long offset;
- unsigned long page;
-};
-static struct pio_request *pio_first = NULL, **pio_last = &pio_first;
-static kmem_cache_t *pio_request_cache;
-static struct wait_queue *pio_wait = NULL;
-
-static inline void
-make_pio_request(struct file *, unsigned long, unsigned long);
-
static inline int sync_page(struct page *page)
{
struct inode *inode = page->inode;
unsigned long limit = num_physpages;
struct page * page;
int count;
- int nr_dirty = 0;
-
+
- /* Make sure we scan all pages twice at priority 0. */
- count = (limit << 1) >> priority;
+ /* Scan at most limit/priority pages; priority is at least 1 here. */
+ count = limit / priority;
refresh_clock:
page = mem_map + clock;
do {
+ int referenced;
+
+ if (current->need_resched) {
+ current->state = TASK_RUNNING;
+ schedule();
+ goto refresh_clock;
+ }
+
/* This works even in the presence of PageSkip because
* the first two entries at the beginning of a hole will
* be marked, not just the first.
clock = page - mem_map;
}
- if (test_and_clear_bit(PG_referenced, &page->flags)) {
- page->age = PAGE_AGE_YOUNG;
- continue;
- }
-
- if (page->age > 0) {
- page->age--;
- continue;
- }
+ count--;
/* We can't free pages unless there's just one user */
if (atomic_read(&page->count) != 1)
continue;
+ referenced = test_and_clear_bit(PG_referenced, &page->flags);
+
if (PageLocked(page))
continue;
- if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
+ if ((gfp_mask & __GFP_DMA) && !PageDMA(page)) {
+ count++;
continue;
+ }
- /* Is it a page swap page? Drop it, its old. */
+ /*
+ * Is it a swap-cache page? If so, we want to
+ * drop it if it is no longer used, even if it
+ * was recently referenced.
+ */
if (PageSwapCache(page)) {
+ if (referenced && swap_count(page->offset) != 1)
+ continue;
delete_from_swap_cache(page);
return 1;
}
+ if (referenced)
+ continue;
+
/* Is it a buffer page? */
if (page->buffers) {
- /*
- * Wait for async IO to complete
- * at each 64 buffers
- */
-
- int wait = ((gfp_mask & __GFP_IO)
- && (!(nr_dirty++ % 64)));
-
if (buffer_under_min())
continue;
/*
* throttling.
*/
- if (!try_to_free_buffers(page, wait)) {
- if(--count < 0) break;
+ if (!try_to_free_buffers(page, gfp_mask))
goto refresh_clock;
- }
return 1;
}
remove_inode_page(page);
return 1;
}
-
- } while (--count > 0);
+ } while (count > 0);
return 0;
}
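The rewritten sweep replaces the old page->age counter with a single referenced bit: a page found referenced has the bit cleared and survives the pass (a second chance), and count is now consumed per page examined rather than per page aged, so a priority-1 scan covers roughly the whole map once. A toy model of the policy (illustration only):

#include <stdio.h>

int main(void)
{
	int referenced[5] = { 1, 0, 1, 0, 0 };	/* toy PG_referenced bits */
	int count = 5, clock = 0, freed = 0;

	while (count-- > 0) {
		int i = clock++ % 5;

		if (referenced[i]) {
			referenced[i] = 0;	/* clear it, give a second chance */
			continue;
		}
		freed++;			/* would try to free this page */
	}
	printf("free candidates: %d\n", freed);	/* prints 3 */
	return 0;
}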
struct page **hash)
{
atomic_inc(&page->count);
- page->flags = (page->flags & ~((1 << PG_uptodate) | (1 << PG_error))) | (1 << PG_referenced);
+ page->flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced));
page->offset = offset;
add_page_to_inode_queue(inode, page);
__add_page_to_hash_queue(page, hash);
if (size > count)
size = count;
- down(&inode->i_sem);
+ fs_down(&inode->i_sem);
old_fs = get_fs();
set_fs(KERNEL_DS);
written = file->f_op->write(file, area, size, &file->f_pos);
set_fs(old_fs);
- up(&inode->i_sem);
+ fs_up(&inode->i_sem);
if (written < 0) {
desc->error = written;
written = 0;
static int filemap_write_page(struct vm_area_struct * vma,
unsigned long offset,
- unsigned long page,
- int wait)
+ unsigned long page)
{
int result;
struct file * file;
* and file could be released ... increment the count to be safe.
*/
file->f_count++;
-
- /*
- * If this is a swapping operation rather than msync(), then
- * leave the actual IO, and the restoration of the file count,
- * to the kpiod thread. Just queue the request for now.
- */
- if (!wait) {
- make_pio_request(file, offset, page);
- return 0;
- }
-
- down(&inode->i_sem);
+ fs_down(&inode->i_sem);
result = do_write_page(inode, file, (const char *) page, offset);
- up(&inode->i_sem);
+ fs_up(&inode->i_sem);
fput(file);
return result;
}
*/
int filemap_swapout(struct vm_area_struct * vma, struct page * page)
{
- return filemap_write_page(vma, page->offset, page_address(page), 0);
+ return filemap_write_page(vma, page->offset, page_address(page));
}
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
return 0;
}
}
- error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page, 1);
+ error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
page_cache_free(page);
return error;
}
if (file) {
struct dentry * dentry = file->f_dentry;
struct inode * inode = dentry->d_inode;
- down(&inode->i_sem);
+ fs_down(&inode->i_sem);
error = file_fsync(file, dentry);
- up(&inode->i_sem);
+ fs_up(&inode->i_sem);
}
}
return error;
page_cache_release(page);
}
-
-/* Add request for page IO to the queue */
-
-static inline void put_pio_request(struct pio_request *p)
-{
- *pio_last = p;
- p->next = NULL;
- pio_last = &p->next;
-}
-
-/* Take the first page IO request off the queue */
-
-static inline struct pio_request * get_pio_request(void)
-{
- struct pio_request * p = pio_first;
- pio_first = p->next;
- if (!pio_first)
- pio_last = &pio_first;
- return p;
-}
-
-/* Make a new page IO request and queue it to the kpiod thread */
-
-static inline void make_pio_request(struct file *file,
- unsigned long offset,
- unsigned long page)
-{
- struct pio_request *p;
-
- atomic_inc(&page_cache_entry(page)->count);
-
- /*
- * We need to allocate without causing any recursive IO in the
- * current thread's context. We might currently be swapping out
- * as a result of an allocation made while holding a critical
- * filesystem lock. To avoid deadlock, we *MUST* not reenter
- * the filesystem in this thread.
- *
- * We can wait for kswapd to free memory, or we can try to free
- * pages without actually performing further IO, without fear of
- * deadlock. --sct
- */
-
- while ((p = kmem_cache_alloc(pio_request_cache, GFP_BUFFER)) == NULL) {
- if (try_to_free_pages(__GFP_WAIT))
- continue;
- current->state = TASK_INTERRUPTIBLE;
- schedule_timeout(HZ/10);
- }
-
- p->file = file;
- p->offset = offset;
- p->page = page;
-
- put_pio_request(p);
- wake_up(&pio_wait);
-}
-
-
-/*
- * This is the only thread which is allowed to write out filemap pages
- * while swapping.
- *
- * To avoid deadlock, it is important that we never reenter this thread.
- * Although recursive memory allocations within this thread may result
- * in more page swapping, that swapping will always be done by queuing
- * another IO request to the same thread: we will never actually start
- * that IO request until we have finished with the current one, and so
- * we will not deadlock.
- */
-
-int kpiod(void * unused)
-{
- struct task_struct *tsk = current;
- struct wait_queue wait = { tsk, };
- struct inode * inode;
- struct dentry * dentry;
- struct pio_request * p;
-
- tsk->session = 1;
- tsk->pgrp = 1;
- strcpy(tsk->comm, "kpiod");
- sigfillset(&tsk->blocked);
- init_waitqueue(&pio_wait);
- /*
- * Mark this task as a memory allocator - we don't want to get caught
- * up in the regular mm freeing frenzy if we have to allocate memory
- * in order to write stuff out.
- */
- tsk->flags |= PF_MEMALLOC;
-
- lock_kernel();
-
- pio_request_cache = kmem_cache_create("pio_request",
- sizeof(struct pio_request),
- 0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
- if (!pio_request_cache)
- panic ("Could not create pio_request slab cache");
-
- while (1) {
- tsk->state = TASK_INTERRUPTIBLE;
- add_wait_queue(&pio_wait, &wait);
- if (!pio_first)
- schedule();
- remove_wait_queue(&pio_wait, &wait);
- tsk->state = TASK_RUNNING;
-
- while (pio_first) {
- p = get_pio_request();
- dentry = p->file->f_dentry;
- inode = dentry->d_inode;
-
- down(&inode->i_sem);
- do_write_page(inode, p->file,
- (const char *) p->page, p->offset);
- up(&inode->i_sem);
- fput(p->file);
- page_cache_free(p->page);
- kmem_cache_free(pio_request_cache, p);
- }
- }
-}
-
void __init page_cache_init(unsigned long memory_size)
{
unsigned long htable_size;
*/
spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED;
+#define list(x) (mem_map+(x))
+#define __free_pages_ok(map_nr, mask, area, index) \
+ nr_free_pages -= (mask); \
+ while ((mask) + (1 << (NR_MEM_LISTS-1))) { \
+ if (!test_and_change_bit((index), (area)->map)) \
+ break; \
+ (area)->count--; \
+ remove_mem_queue(list((map_nr) ^ -(mask))); \
+ (mask) <<= 1; \
+ (area)++; \
+ (index) >>= 1; \
+ (map_nr) &= (mask); \
+ } \
+ add_mem_queue(area, list(map_nr));
+
+static void free_local_pages(struct page * page)
+{
+ unsigned long order = page->offset;
+ unsigned int type = PageDMA(page) ? 1 : 0;
+ struct free_area_struct *area;
+ unsigned long map_nr = page - mem_map;
+ unsigned long mask = (~0UL) << order;
+ unsigned long index = map_nr >> (1 + order);
+
+ area = free_area[type] + order;
+ __free_pages_ok(map_nr, mask, area, index);
+}
+
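A note on the bit-twiddling in the macro: with mask = ~0UL << order, the two's complement -mask equals 1UL << order, so map_nr ^ -mask is the classic buddy computation map_nr XOR (1 << order), and nr_free_pages -= mask subtracts a negative, i.e. credits 1 << order freed pages. A standalone check of the identity (illustration only):

#include <stdio.h>

int main(void)
{
	unsigned long order, map_nr = 20;	/* page 20 is aligned up to order 2 */

	for (order = 0; order < 3; order++) {
		unsigned long mask = (~0UL) << order;

		printf("order %lu: -mask = %lu, buddy of %lu is %lu\n",
		       order, -mask, map_nr, map_nr ^ -mask);
	}
	return 0;
}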
static inline void free_pages_ok(unsigned long map_nr, unsigned long order, unsigned type)
{
- struct free_area_struct *area = free_area[type] + order;
- unsigned long index = map_nr >> (1 + order);
- unsigned long mask = (~0UL) << order;
+ struct free_area_struct *area;
+ unsigned long index;
+ unsigned long mask;
unsigned long flags;
+ struct page * page;
- spin_lock_irqsave(&page_alloc_lock, flags);
-
-#define list(x) (mem_map+(x))
+ if (current->flags & PF_FREE_PAGES)
+ goto local_freelist;
+ back_local_freelist:
+ index = map_nr >> (1 + order);
+ mask = (~0UL) << order;
map_nr &= mask;
- nr_free_pages -= mask;
- while (mask + (1 << (NR_MEM_LISTS-1))) {
- if (!test_and_change_bit(index, area->map))
- break;
- area->count--;
- remove_mem_queue(list(map_nr ^ -mask));
- mask <<= 1;
- area++;
- index >>= 1;
- map_nr &= mask;
- }
- add_mem_queue(area, list(map_nr));
-
-#undef list
+ spin_lock_irqsave(&page_alloc_lock, flags);
+ area = free_area[type] + order;
+ __free_pages_ok(map_nr, mask, area, index);
spin_unlock_irqrestore(&page_alloc_lock, flags);
+ return;
+
+ local_freelist:
+ /*
+ * This is a little subtle: if the allocation order
+ * wanted is greater than zero we'd better take all the
+ * pages local, since we must deal with fragmentation too
+ * and we can't rely on the nr_local_pages information.
+ */
+ if (current->nr_local_pages && !current->allocation_order)
+ goto back_local_freelist;
+
+ page = mem_map + map_nr;
+ list_add((struct list_head *) page, &current->local_pages);
+ page->offset = order;
+ current->nr_local_pages++;
}
void __free_pages(struct page *page, unsigned long order)
if (PageSwapCache(page))
panic ("Freeing swap cache page");
page->flags &= ~(1 << PG_referenced);
- page->age = PAGE_AGE_INITIAL;
free_pages_ok(page - mem_map, order, PageDMA(page) ? 1 : 0);
return;
}
atomic_set(&map->count, 1); \
} while (0)
+static void refile_local_pages(void)
+{
+ if (current->nr_local_pages) {
+ struct page * page;
+ struct list_head * entry;
+ int nr_pages = current->nr_local_pages;
+
+ while ((entry = current->local_pages.next) != &current->local_pages) {
+ list_del(entry);
+ page = (struct page *) entry;
+ free_local_pages(page);
+ if (!nr_pages--)
+ panic("__get_free_pages local_pages list corrupted I");
+ }
+ if (nr_pages)
+ panic("__get_free_pages local_pages list corrupted II");
+ current->nr_local_pages = 0;
+ }
+}
+
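The PF_FREE_PAGES round trip is easiest to see end to end: while the flag is set, pages freed by this task are parked on its private local_pages list instead of the shared free pool, so memory the task reclaims on its own behalf cannot be snatched by a concurrent allocator before refile_local_pages() hands it back under page_alloc_lock. A userspace model (illustration only):

#include <stdio.h>

#define MAXP 8
static int shared_pool;				/* models nr_free_pages */
static int local_pages[MAXP], nr_local;		/* models current->local_pages */
static int pf_free_pages;			/* models PF_FREE_PAGES */

static void model_free_page(int page)
{
	if (pf_free_pages)
		local_pages[nr_local++] = page;	/* park it privately */
	else
		shared_pool++;
}

static void model_refile_local_pages(void)
{
	while (nr_local > 0)
		shared_pool++, nr_local--;	/* hand back to the buddy lists */
}

int main(void)
{
	pf_free_pages = 1;
	model_free_page(42);			/* freed during our own reclaim */
	printf("shared=%d local=%d\n", shared_pool, nr_local);	/* 0 1 */
	pf_free_pages = 0;
	model_refile_local_pages();
	printf("shared=%d local=%d\n", shared_pool, nr_local);	/* 1 0 */
	return 0;
}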
unsigned long __get_free_pages(int gfp_mask, unsigned long order)
{
unsigned long flags;
- static atomic_t free_before_allocate = ATOMIC_INIT(0);
if (order >= NR_MEM_LISTS)
- goto nopage;
+ goto out;
#ifdef ATOMIC_MEMORY_DEBUGGING
if ((gfp_mask & __GFP_WAIT) && in_interrupt()) {
printk("gfp called nonatomically from interrupt %p\n",
__builtin_return_address(0));
}
- goto nopage;
+ goto out;
}
#endif
+ /*
+ * Acquire lock before reading nr_free_pages to make sure it
+ * won't change from under us.
+ */
+ spin_lock_irqsave(&page_alloc_lock, flags);
+
/*
* If this is a recursive call, we'd better
* do our best to just allocate things without
* further thought.
*/
if (!(current->flags & PF_MEMALLOC)) {
- int freed;
extern struct wait_queue * kswapd_wait;
- /* Somebody needs to free pages so we free some of our own. */
- if (atomic_read(&free_before_allocate)) {
- current->flags |= PF_MEMALLOC;
- try_to_free_pages(gfp_mask);
- current->flags &= ~PF_MEMALLOC;
- }
-
if (nr_free_pages > freepages.low)
goto ok_to_allocate;
/* Do we have to block or can we proceed? */
if (nr_free_pages > freepages.min)
goto ok_to_allocate;
-
- current->flags |= PF_MEMALLOC;
- atomic_inc(&free_before_allocate);
- freed = try_to_free_pages(gfp_mask);
- atomic_dec(&free_before_allocate);
- current->flags &= ~PF_MEMALLOC;
-
- /*
- * Re-check we're still low on memory after we blocked
- * for some time. Somebody may have released lots of
- * memory from under us while we was trying to free
- * the pages. We check against pages_high to be sure
- * to succeed only if lots of memory is been released.
- */
- if (nr_free_pages > freepages.high)
- goto ok_to_allocate;
-
- if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
- goto nopage;
+ if (gfp_mask & __GFP_WAIT) {
+ int freed;
+ /*
+ * If the task is ok to sleep it's fine also
+ * if we release irq here.
+ */
+ spin_unlock_irq(&page_alloc_lock);
+
+ current->flags |= PF_MEMALLOC|PF_FREE_PAGES;
+ current->allocation_order = order;
+ freed = try_to_free_pages(gfp_mask);
+ current->flags &= ~(PF_MEMALLOC|PF_FREE_PAGES);
+
+ spin_lock_irq(&page_alloc_lock);
+ refile_local_pages();
+
+ /*
+ * Re-check we're still low on memory after we blocked
+ * for some time. Somebody may have released lots of
+ * memory from under us while we were trying to free
+ * the pages. We check against freepages.high to be sure
+ * to succeed only if lots of memory has been released.
+ */
+ if (nr_free_pages > freepages.high)
+ goto ok_to_allocate;
+
+ if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
+ goto nopage;
+ }
}
ok_to_allocate:
- spin_lock_irqsave(&page_alloc_lock, flags);
/* if it's not a dma request, try non-dma first */
if (!(gfp_mask & __GFP_DMA))
RMQUEUE_TYPE(order, 0);
RMQUEUE_TYPE(order, 1);
+ nopage:
spin_unlock_irqrestore(&page_alloc_lock, flags);
-
-nopage:
+ out:
return 0;
}
return 0;
}
atomic_inc(&page->count);
+ page->flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced));
page->inode = &swapper_inode;
page->offset = entry;
add_page_to_hash_queue(page, &swapper_inode, entry);
* some real work in the future in "shrink_mmap()".
*/
if (!pte_dirty(pte)) {
+ if (page_map->inode && pgcache_under_min())
+ /* unmapping this page would be useless */
+ return 0;
flush_cache_page(vma, address);
pte_clear(page_table);
goto drop_pte;
* we cannot do I/O! Avoid recursing on FS
* locks etc.
*/
- if (!(gfp_mask & __GFP_IO))
+ if (!(gfp_mask & __GFP_IO) || current->fs_locks)
return 0;
/*
result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
if (result)
return result;
+ if (current->need_resched)
+ return 2;
address += PAGE_SIZE;
pte++;
} while (address < end);
* Think of swap_cnt as a "shadow rss" - it tells us which process
* we want to page out (always try largest first).
*/
- counter = nr_tasks / (priority+1);
+ counter = nr_tasks / priority;
if (counter < 1)
counter = 1;
goto out;
}
- if (swap_out_process(pbest, gfp_mask))
+ switch (swap_out_process(pbest, gfp_mask)) {
+ case 1:
return 1;
+ case 2:
+ current->state = TASK_RUNNING;
+ schedule();
+ }
}
out:
return 0;
* cluster them so that we get good swap-out behaviour. See
* the "free_memory()" macro for details.
*/
-static int do_try_to_free_pages(unsigned int gfp_mask)
+int try_to_free_pages(unsigned int gfp_mask)
{
int priority;
- int ret = 0;
- int swapcount;
int count = SWAP_CLUSTER_MAX;
lock_kernel();
/* Always trim SLAB caches when memory gets low. */
kmem_cache_reap(gfp_mask);
- priority = 6;
+ priority = 5;
do {
while (shrink_mmap(priority, gfp_mask)) {
- ret = 1;
if (!--count)
goto done;
}
/* Try to get rid of some shared memory pages.. */
- if (gfp_mask & __GFP_IO) {
+ if (gfp_mask & __GFP_IO && !current->fs_locks) {
while (shm_swap(priority, gfp_mask)) {
- ret = 1;
if (!--count)
goto done;
}
}
/* Then, try to page stuff out.. */
- swapcount = count;
while (swap_out(priority, gfp_mask)) {
- ret = 1;
- if (!--swapcount)
- break;
+ if (!--count)
+ goto done;
}
shrink_dcache_memory(priority, gfp_mask);
- } while (--priority >= 0);
+ } while (--priority > 0);
done:
unlock_kernel();
- if (!ret)
- printk("VM: do_try_to_free_pages failed for %s...\n",
- current->comm);
- /* Return success if we freed a page. */
- return ret;
+ /* Return success only if we freed a full cluster of pages. */
+ return priority > 0;
}
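With the do_try_to_free_pages()/try_to_free_pages() split collapsed (the old wrapper is deleted at the end of the patch), the function itself now walks priorities 5 down to 1 and bails out through done: as soon as SWAP_CLUSTER_MAX pages have been freed, so the final priority > 0 is precisely "did we free a full cluster". A toy model of the loop (illustration only):

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32

static int model_try_to_free_pages(int freeable)
{
	int priority = 5;
	int count = SWAP_CLUSTER_MAX;

	do {
		while (freeable > 0) {		/* stands in for the shrinkers */
			freeable--;
			if (!--count)
				goto done;
		}
	} while (--priority > 0);
done:
	return priority > 0;			/* freed a full cluster? */
}

int main(void)
{
	printf("plenty to free -> %d\n", model_try_to_free_pages(100));	/* 1 */
	printf("little to free -> %d\n", model_try_to_free_pages(3));	/* 0 */
	return 0;
}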
/*
while (nr_free_pages < freepages.high)
{
- if (do_try_to_free_pages(GFP_KSWAPD))
+ if (try_to_free_pages(GFP_KSWAPD))
{
if (tsk->need_resched)
schedule();
}
}
}
-
-/*
- * Called by non-kswapd processes when kswapd really cannot
- * keep up with the demand for free memory.
- */
-int try_to_free_pages(unsigned int gfp_mask)
-{
- int retval = 1;
-
- if (gfp_mask & __GFP_WAIT)
- retval = do_try_to_free_pages(gfp_mask);
- return retval;
-}
-