git.neil.brown.name Git - history.git/commitdiff
[XFS] Move bits around to better manage common code. No functional change.
authorRussell Cattelan <cattelan@sgi.com>
Sat, 31 Jan 2004 01:07:59 +0000 (12:07 +1100)
committerChristoph Hellwig <hch@lst.de>
Sat, 31 Jan 2004 01:07:59 +0000 (12:07 +1100)
33 files changed:
fs/xfs/Makefile
fs/xfs/linux/kmem.h [new file with mode: 0644]
fs/xfs/linux/mrlock.c [new file with mode: 0644]
fs/xfs/linux/mrlock.h [new file with mode: 0644]
fs/xfs/linux/mutex.h [new file with mode: 0644]
fs/xfs/linux/sema.h [new file with mode: 0644]
fs/xfs/linux/spin.h [new file with mode: 0644]
fs/xfs/linux/sv.h [new file with mode: 0644]
fs/xfs/linux/time.h [new file with mode: 0644]
fs/xfs/linux/xfs_behavior.c [deleted file]
fs/xfs/linux/xfs_behavior.h [deleted file]
fs/xfs/linux/xfs_buf.c [new file with mode: 0644]
fs/xfs/linux/xfs_buf.h [new file with mode: 0644]
fs/xfs/linux/xfs_iomap.c [deleted file]
fs/xfs/linux/xfs_linux.h
fs/xfs/pagebuf/page_buf.c [deleted file]
fs/xfs/pagebuf/page_buf.h [deleted file]
fs/xfs/support/kmem.h [deleted file]
fs/xfs/support/ktrace.c
fs/xfs/support/ktrace.h
fs/xfs/support/mrlock.c [deleted file]
fs/xfs/support/mrlock.h [deleted file]
fs/xfs/support/mutex.h [deleted file]
fs/xfs/support/sema.h [deleted file]
fs/xfs/support/spin.h [deleted file]
fs/xfs/support/sv.h [deleted file]
fs/xfs/support/time.h [deleted file]
fs/xfs/support/uuid.c
fs/xfs/xfs.h
fs/xfs/xfs_behavior.c [new file with mode: 0644]
fs/xfs/xfs_behavior.h [new file with mode: 0644]
fs/xfs/xfs_buf.h [deleted file]
fs/xfs/xfs_iomap.c [new file with mode: 0644]

index 30c511e81a3ccfee5408cb16221c54d575fe4a0b..5475efd6a2f7e74716367b17681a9021863da25f 100644 (file)
@@ -30,7 +30,7 @@
 # http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 #
 
-EXTRA_CFLAGS +=         -Ifs/xfs -funsigned-char
+EXTRA_CFLAGS +=         -Ifs/xfs -Ifs/xfs/linux -funsigned-char
 
 ifeq ($(CONFIG_XFS_DEBUG),y)
        EXTRA_CFLAGS += -g -DSTATIC="" -DDEBUG -DXFSDEBUG
@@ -83,6 +83,7 @@ xfs-y                         += xfs_alloc.o \
                                   xfs_alloc_btree.o \
                                   xfs_attr.o \
                                   xfs_attr_leaf.o \
+                                  xfs_behavior.o \
                                   xfs_bit.o \
                                   xfs_bmap.o \
                                   xfs_bmap_btree.o \
@@ -106,6 +107,7 @@ xfs-y                               += xfs_alloc.o \
                                   xfs_inode.o \
                                   xfs_inode_item.o \
                                   xfs_iocore.o \
+                                  xfs_iomap.o \
                                   xfs_itable.o \
                                   xfs_dfrag.o \
                                   xfs_log.o \
@@ -126,18 +128,15 @@ xfs-y                             += xfs_alloc.o \
 
 xfs-$(CONFIG_XFS_TRACE)                += xfs_dir2_trace.o
 
-# Objects in pagebuf/
-xfs-y                          += pagebuf/page_buf.o
-
 # Objects in linux/
 xfs-y                          += $(addprefix linux/, \
+                                  mrlock.o \
                                   xfs_aops.o \
-                                  xfs_behavior.o \
+                                  xfs_buf.o \
                                   xfs_file.o \
                                   xfs_fs_subr.o \
                                   xfs_globals.o \
                                   xfs_ioctl.o \
-                                  xfs_iomap.o \
                                   xfs_iops.o \
                                   xfs_lrw.o \
                                   xfs_super.o \
@@ -148,7 +147,6 @@ xfs-y                               += $(addprefix linux/, \
 xfs-y                          += $(addprefix support/, \
                                   debug.o \
                                   move.o \
-                                  mrlock.o \
                                   qsort.o \
                                   uuid.o)
 
diff --git a/fs/xfs/linux/kmem.h b/fs/xfs/linux/kmem.h
new file mode 100644 (file)
index 0000000..a8fb09f
--- /dev/null
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.  Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_KMEM_H__
+#define __XFS_SUPPORT_KMEM_H__
+
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+/*
+ * Cutoff point to use vmalloc instead of kmalloc.
+ */
+#define MAX_SLAB_SIZE  0x10000
+
+/*
+ * XFS uses slightly different names for these due to the
+ * IRIX heritage.
+ */
+#define        kmem_zone       kmem_cache_s
+#define kmem_zone_t    kmem_cache_t
+
+#define KM_SLEEP       0x0001
+#define KM_NOSLEEP     0x0002
+#define KM_NOFS                0x0004
+
+typedef unsigned long xfs_pflags_t;
+
+#define PFLAGS_TEST_FSTRANS()          (current->flags & PF_FSTRANS)
+
+#define PFLAGS_SET_FSTRANS(STATEP) do {        \
+       *(STATEP) = current->flags;     \
+       current->flags |= PF_FSTRANS;   \
+} while (0)
+
+#define PFLAGS_RESTORE(STATEP) do {    \
+       current->flags = *(STATEP);     \
+} while (0)
+
+#define PFLAGS_DUP(OSTATEP, NSTATEP) do { \
+       *(NSTATEP) = *(OSTATEP);        \
+} while (0)
+
+/*
+ * XXX get rid of the unconditional  __GFP_NOFAIL by adding
+ * a KM_FAIL flag and using it where we're allowed to fail.
+ */
+static __inline unsigned int
+kmem_flags_convert(int flags)
+{
+       int lflags;
+
+#if DEBUG
+       if (unlikely(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS))) {
+               printk(KERN_WARNING
+                   "XFS: memory allocation with wrong flags (%x)\n", flags);
+               BUG();
+       }
+#endif
+
+       lflags = (flags & KM_NOSLEEP) ? GFP_ATOMIC : (GFP_KERNEL|__GFP_NOFAIL);
+
+       /* avoid recusive callbacks to filesystem during transactions */
+       if (PFLAGS_TEST_FSTRANS())
+               lflags &= ~__GFP_FS;
+
+       return lflags;
+}
+
+static __inline void *
+kmem_alloc(size_t size, int flags)
+{
+       if (unlikely(MAX_SLAB_SIZE < size))
+               /* Avoid doing filesystem sensitive stuff to get this */
+               return __vmalloc(size, kmem_flags_convert(flags), PAGE_KERNEL);
+       return kmalloc(size, kmem_flags_convert(flags));
+}
+
+static __inline void *
+kmem_zalloc(size_t size, int flags)
+{
+       void *ptr = kmem_alloc(size, flags);
+       if (likely(ptr != NULL))
+               memset(ptr, 0, size);
+       return ptr;
+}
+
+static __inline void
+kmem_free(void *ptr, size_t size)
+{
+       if (unlikely((unsigned long)ptr < VMALLOC_START ||
+                    (unsigned long)ptr >= VMALLOC_END))
+               kfree(ptr);
+       else
+               vfree(ptr);
+}
+
+static __inline void *
+kmem_realloc(void *ptr, size_t newsize, size_t oldsize, int flags)
+{
+       void *new = kmem_alloc(newsize, flags);
+
+       if (likely(ptr != NULL)) {
+               if (likely(new != NULL))
+                       memcpy(new, ptr, min(oldsize, newsize));
+               kmem_free(ptr, oldsize);
+       }
+
+       return new;
+}
+
+static __inline kmem_zone_t *
+kmem_zone_init(int size, char *zone_name)
+{
+       return kmem_cache_create(zone_name, size, 0, 0, NULL, NULL);
+}
+
+static __inline void *
+kmem_zone_alloc(kmem_zone_t *zone, int flags)
+{
+       return kmem_cache_alloc(zone, kmem_flags_convert(flags));
+}
+
+static __inline void *
+kmem_zone_zalloc(kmem_zone_t *zone, int flags)
+{
+       void *ptr = kmem_zone_alloc(zone, flags);
+       if (likely(ptr != NULL))
+               memset(ptr, 0, kmem_cache_size(zone));
+       return ptr;
+}
+
+static __inline void
+kmem_zone_free(kmem_zone_t *zone, void *ptr)
+{
+       kmem_cache_free(zone, ptr);
+}
+
+typedef struct shrinker *kmem_shaker_t;
+typedef int (*kmem_shake_func_t)(int, unsigned int);
+
+static __inline kmem_shaker_t
+kmem_shake_register(kmem_shake_func_t sfunc)
+{
+       return set_shrinker(DEFAULT_SEEKS, sfunc);
+}
+
+static __inline void
+kmem_shake_deregister(kmem_shaker_t shrinker)
+{
+       remove_shrinker(shrinker);
+}
+
+static __inline int
+kmem_shake_allow(unsigned int gfp_mask)
+{
+       return (gfp_mask & __GFP_WAIT);
+}
+
+#endif /* __XFS_SUPPORT_KMEM_H__ */
diff --git a/fs/xfs/linux/mrlock.c b/fs/xfs/linux/mrlock.c
new file mode 100644 (file)
index 0000000..5b5dae9
--- /dev/null
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.  Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <linux/time.h>
+#include <linux/sched.h>
+#include <asm/system.h>
+#include <linux/interrupt.h>
+#include <asm/current.h>
+
+#include "mrlock.h"
+
+
+#if USE_RW_WAIT_QUEUE_SPINLOCK
+# define wq_write_lock write_lock
+#else
+# define wq_write_lock spin_lock
+#endif
+
+/*
+ * We don't seem to need lock_type (only one supported), name, or
+ * sequence. But, XFS will pass it so let's leave them here for now.
+ */
+/* ARGSUSED */
+void
+mrlock_init(mrlock_t *mrp, int lock_type, char *name, long sequence)
+{
+       mrp->mr_count = 0;
+       mrp->mr_reads_waiting = 0;
+       mrp->mr_writes_waiting = 0;
+       init_waitqueue_head(&mrp->mr_readerq);
+       init_waitqueue_head(&mrp->mr_writerq);
+       mrp->mr_lock = SPIN_LOCK_UNLOCKED;
+}
+
+/*
+ * Macros to lock/unlock the mrlock_t.
+ */
+
+#define MRLOCK(m)              spin_lock(&(m)->mr_lock);
+#define MRUNLOCK(m)            spin_unlock(&(m)->mr_lock);
+
+
+/*
+ * lock_wait should never be called in an interrupt thread.
+ *
+ * mrlocks can sleep (i.e. call schedule) and so they can't ever
+ * be called from an interrupt thread.
+ *
+ * threads that wake-up should also never be invoked from interrupt threads.
+ *
+ * But, waitqueue_lock is locked from interrupt threads - and we are
+ * called with interrupts disabled, so it is all OK.
+ */
+
+/* ARGSUSED */
+void
+lock_wait(wait_queue_head_t *q, spinlock_t *lock, int rw)
+{
+       DECLARE_WAITQUEUE( wait, current );
+
+       __set_current_state(TASK_UNINTERRUPTIBLE);
+
+       spin_lock(&q->lock);
+       if (rw) {
+               __add_wait_queue_tail(q, &wait);
+       } else {
+               __add_wait_queue(q, &wait);
+       }
+
+       spin_unlock(&q->lock);
+       spin_unlock(lock);
+
+       schedule();
+
+       spin_lock(&q->lock);
+       __remove_wait_queue(q, &wait);
+       spin_unlock(&q->lock);
+
+       spin_lock(lock);
+
+       /* return with lock held */
+}
+
+/* ARGSUSED */
+void
+mrfree(mrlock_t *mrp)
+{
+}
+
+/* ARGSUSED */
+void
+mrlock(mrlock_t *mrp, int type, int flags)
+{
+       if (type == MR_ACCESS)
+               mraccess(mrp);
+       else
+               mrupdate(mrp);
+}
+
+/* ARGSUSED */
+void
+mraccessf(mrlock_t *mrp, int flags)
+{
+       MRLOCK(mrp);
+       if(mrp->mr_writes_waiting > 0) {
+               mrp->mr_reads_waiting++;
+               lock_wait(&mrp->mr_readerq, &mrp->mr_lock, 0);
+               mrp->mr_reads_waiting--;
+       }
+       while (mrp->mr_count < 0) {
+               mrp->mr_reads_waiting++;
+               lock_wait(&mrp->mr_readerq, &mrp->mr_lock, 0);
+               mrp->mr_reads_waiting--;
+       }
+       mrp->mr_count++;
+       MRUNLOCK(mrp);
+}
+
+/* ARGSUSED */
+void
+mrupdatef(mrlock_t *mrp, int flags)
+{
+       MRLOCK(mrp);
+       while(mrp->mr_count) {
+               mrp->mr_writes_waiting++;
+               lock_wait(&mrp->mr_writerq, &mrp->mr_lock, 1);
+               mrp->mr_writes_waiting--;
+       }
+
+       mrp->mr_count = -1; /* writer on it */
+       MRUNLOCK(mrp);
+}
+
+int
+mrtryaccess(mrlock_t *mrp)
+{
+       MRLOCK(mrp);
+       /*
+        * If anyone is waiting for update access or the lock is held for update
+        * fail the request.
+        */
+       if(mrp->mr_writes_waiting > 0 || mrp->mr_count < 0) {
+               MRUNLOCK(mrp);
+               return 0;
+       }
+       mrp->mr_count++;
+       MRUNLOCK(mrp);
+       return 1;
+}
+
+int
+mrtrypromote(mrlock_t *mrp)
+{
+       MRLOCK(mrp);
+
+       if(mrp->mr_count == 1) { /* We are the only thread with the lock */
+               mrp->mr_count = -1; /* writer on it */
+               MRUNLOCK(mrp);
+               return 1;
+       }
+
+       MRUNLOCK(mrp);
+       return 0;
+}
+
+int
+mrtryupdate(mrlock_t *mrp)
+{
+       MRLOCK(mrp);
+
+       if(mrp->mr_count) {
+               MRUNLOCK(mrp);
+               return 0;
+       }
+
+       mrp->mr_count = -1; /* writer on it */
+       MRUNLOCK(mrp);
+       return 1;
+}
+
+static __inline__ void mrwake(mrlock_t *mrp)
+{
+       /*
+        * First, if the count is now 0, we need to wake-up anyone waiting.
+        */
+       if (!mrp->mr_count) {
+               if (mrp->mr_writes_waiting) {   /* Wake-up first writer waiting */
+                       wake_up(&mrp->mr_writerq);
+               } else if (mrp->mr_reads_waiting) {     /* Wakeup any readers waiting */
+                       wake_up(&mrp->mr_readerq);
+               }
+       }
+}
+
+void
+mraccunlock(mrlock_t *mrp)
+{
+       MRLOCK(mrp);
+       mrp->mr_count--;
+       mrwake(mrp);
+       MRUNLOCK(mrp);
+}
+
+void
+mrunlock(mrlock_t *mrp)
+{
+       MRLOCK(mrp);
+       if (mrp->mr_count < 0) {
+               mrp->mr_count = 0;
+       } else {
+               mrp->mr_count--;
+       }
+       mrwake(mrp);
+       MRUNLOCK(mrp);
+}
+
+int
+ismrlocked(mrlock_t *mrp, int type)    /* No need to lock since info can change */
+{
+       if (type == MR_ACCESS)
+               return (mrp->mr_count > 0); /* Read lock */
+       else if (type == MR_UPDATE)
+               return (mrp->mr_count < 0); /* Write lock */
+       else if (type == (MR_UPDATE | MR_ACCESS))
+               return (mrp->mr_count); /* Any type of lock held */
+       else /* Any waiters */
+               return (mrp->mr_reads_waiting | mrp->mr_writes_waiting);
+}
+
+/*
+ * Demote from update to access. We better be the only thread with the
+ * lock in update mode so it should be easy to set to 1.
+ * Wake-up any readers waiting.
+ */
+
+void
+mrdemote(mrlock_t *mrp)
+{
+       MRLOCK(mrp);
+       mrp->mr_count = 1;
+       if (mrp->mr_reads_waiting) {    /* Wakeup all readers waiting */
+               wake_up(&mrp->mr_readerq);
+       }
+       MRUNLOCK(mrp);
+}
diff --git a/fs/xfs/linux/mrlock.h b/fs/xfs/linux/mrlock.h
new file mode 100644 (file)
index 0000000..b2a7b3a
--- /dev/null
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.  Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_MRLOCK_H__
+#define __XFS_SUPPORT_MRLOCK_H__
+
+#include <linux/time.h>
+#include <linux/wait.h>
+#include <asm/atomic.h>
+#include <asm/semaphore.h>
+
+/*
+ * Implement mrlocks on Linux that work for XFS.
+ *
+ * These are sleep locks and not spinlocks. If one wants read/write spinlocks,
+ * use read_lock, write_lock, ... see spinlock.h.
+ */
+
+typedef struct mrlock_s {
+       int                     mr_count;
+       unsigned short          mr_reads_waiting;
+       unsigned short          mr_writes_waiting;
+       wait_queue_head_t       mr_readerq;
+       wait_queue_head_t       mr_writerq;
+       spinlock_t              mr_lock;
+} mrlock_t;
+
+#define MR_ACCESS      1
+#define MR_UPDATE      2
+
+#define MRLOCK_BARRIER         0x1
+#define MRLOCK_ALLOW_EQUAL_PRI 0x8
+
+/*
+ * mraccessf/mrupdatef take flags to be passed in while sleeping;
+ * only PLTWAIT is currently supported.
+ */
+
+extern void    mraccessf(mrlock_t *, int);
+extern void    mrupdatef(mrlock_t *, int);
+extern void     mrlock(mrlock_t *, int, int);
+extern void     mrunlock(mrlock_t *);
+extern void     mraccunlock(mrlock_t *);
+extern int      mrtryupdate(mrlock_t *);
+extern int      mrtryaccess(mrlock_t *);
+extern int     mrtrypromote(mrlock_t *);
+extern void     mrdemote(mrlock_t *);
+
+extern int     ismrlocked(mrlock_t *, int);
+extern void     mrlock_init(mrlock_t *, int type, char *name, long sequence);
+extern void     mrfree(mrlock_t *);
+
+#define mrinit(mrp, name)      mrlock_init(mrp, MRLOCK_BARRIER, name, -1)
+#define mraccess(mrp)          mraccessf(mrp, 0) /* grab for READ/ACCESS */
+#define mrupdate(mrp)          mrupdatef(mrp, 0) /* grab for WRITE/UPDATE */
+#define mrislocked_access(mrp) ((mrp)->mr_count > 0)
+#define mrislocked_update(mrp) ((mrp)->mr_count < 0)
+
+#endif /* __XFS_SUPPORT_MRLOCK_H__ */
diff --git a/fs/xfs/linux/mutex.h b/fs/xfs/linux/mutex.h
new file mode 100644 (file)
index 0000000..0b296bb
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.  Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_MUTEX_H__
+#define __XFS_SUPPORT_MUTEX_H__
+
+#include <linux/spinlock.h>
+#include <asm/semaphore.h>
+
+/*
+ * Map the mutex'es from IRIX to Linux semaphores.
+ *
+ * Destroy just simply initializes to -99 which should block all other
+ * callers.
+ */
+#define MUTEX_DEFAULT          0x0
+typedef struct semaphore       mutex_t;
+
+#define mutex_init(lock, type, name)           sema_init(lock, 1)
+#define mutex_destroy(lock)                    sema_init(lock, -99)
+#define mutex_lock(lock, num)                  down(lock)
+#define mutex_trylock(lock)                    (down_trylock(lock) ? 0 : 1)
+#define mutex_unlock(lock)                     up(lock)
+
+#endif /* __XFS_SUPPORT_MUTEX_H__ */
diff --git a/fs/xfs/linux/sema.h b/fs/xfs/linux/sema.h
new file mode 100644 (file)
index 0000000..30b67b4
--- /dev/null
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.  Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_SEMA_H__
+#define __XFS_SUPPORT_SEMA_H__
+
+#include <linux/time.h>
+#include <linux/wait.h>
+#include <asm/atomic.h>
+#include <asm/semaphore.h>
+
+/*
+ * sema_t structure just maps to struct semaphore in Linux kernel.
+ */
+
+typedef struct semaphore sema_t;
+
+#define init_sema(sp, val, c, d)       sema_init(sp, val)
+#define initsema(sp, val)              sema_init(sp, val)
+#define initnsema(sp, val, name)       sema_init(sp, val)
+#define psema(sp, b)                   down(sp)
+#define vsema(sp)                      up(sp)
+#define valusema(sp)                   (atomic_read(&(sp)->count))
+#define freesema(sema)
+
+/*
+ * Map cpsema (try to get the sema) to down_trylock. We need to switch
+ * the return values since cpsema returns 1 (acquired) 0 (failed) and
+ * down_trylock returns the reverse 0 (acquired) 1 (failed).
+ */
+
+#define cpsema(sp)                     (down_trylock(sp) ? 0 : 1)
+
+/*
+ * Didn't do cvsema(sp). Not sure how to map this to up/down/...
+ * It does a vsema if the values is < 0 other wise nothing.
+ */
+
+#endif /* __XFS_SUPPORT_SEMA_H__ */
diff --git a/fs/xfs/linux/spin.h b/fs/xfs/linux/spin.h
new file mode 100644 (file)
index 0000000..80a3a6b
--- /dev/null
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.  Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_SPIN_H__
+#define __XFS_SUPPORT_SPIN_H__
+
+#include <linux/sched.h>       /* preempt needs this */
+#include <linux/spinlock.h>
+
+/*
+ * Map lock_t from IRIX to Linux spinlocks.
+ *
+ * Note that linux turns on/off spinlocks depending on CONFIG_SMP.
+ * We don't need to worry about SMP or not here.
+ */
+
+#define SPLDECL(s)             unsigned long s
+
+typedef spinlock_t lock_t;
+
+#define spinlock_init(lock, name)      spin_lock_init(lock)
+#define        spinlock_destroy(lock)
+
+static inline unsigned long mutex_spinlock(lock_t *lock)
+{
+       spin_lock(lock);
+       return 0;
+}
+
+/*ARGSUSED*/
+static inline void mutex_spinunlock(lock_t *lock, unsigned long s)
+{
+       spin_unlock(lock);
+}
+
+static inline void nested_spinlock(lock_t *lock)
+{
+       spin_lock(lock);
+}
+
+static inline void nested_spinunlock(lock_t *lock)
+{
+       spin_unlock(lock);
+}
+
+#endif /* __XFS_SUPPORT_SPIN_H__ */
diff --git a/fs/xfs/linux/sv.h b/fs/xfs/linux/sv.h
new file mode 100644 (file)
index 0000000..821d316
--- /dev/null
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.  Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_SV_H__
+#define __XFS_SUPPORT_SV_H__
+
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+
+/*
+ * Synchronisation variables.
+ *
+ * (Parameters "pri", "svf" and "rts" are not implemented)
+ */
+
+typedef struct sv_s {
+       wait_queue_head_t waiters;
+} sv_t;
+
+#define SV_FIFO                0x0             /* sv_t is FIFO type */
+#define SV_LIFO                0x2             /* sv_t is LIFO type */
+#define SV_PRIO                0x4             /* sv_t is PRIO type */
+#define SV_KEYED       0x6             /* sv_t is KEYED type */
+#define SV_DEFAULT      SV_FIFO
+
+
+static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state,
+                            unsigned long timeout)
+{
+       DECLARE_WAITQUEUE(wait, current);
+
+       add_wait_queue_exclusive(&sv->waiters, &wait);
+       __set_current_state(state);
+       spin_unlock(lock);
+
+       schedule_timeout(timeout);
+
+       remove_wait_queue(&sv->waiters, &wait);
+}
+
+#define init_sv(sv,type,name,flag) \
+       init_waitqueue_head(&(sv)->waiters)
+#define sv_init(sv,flag,name) \
+       init_waitqueue_head(&(sv)->waiters)
+#define sv_destroy(sv) \
+       /*NOTHING*/
+#define sv_wait(sv, pri, lock, s) \
+       _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
+#define sv_wait_sig(sv, pri, lock, s)   \
+       _sv_wait(sv, lock, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
+#define sv_timedwait(sv, pri, lock, s, svf, ts, rts) \
+       _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, timespec_to_jiffies(ts))
+#define sv_timedwait_sig(sv, pri, lock, s, svf, ts, rts) \
+       _sv_wait(sv, lock, TASK_INTERRUPTIBLE, timespec_to_jiffies(ts))
+#define sv_signal(sv) \
+       wake_up(&(sv)->waiters)
+#define sv_broadcast(sv) \
+       wake_up_all(&(sv)->waiters)
+
+#endif /* __XFS_SUPPORT_SV_H__ */
diff --git a/fs/xfs/linux/time.h b/fs/xfs/linux/time.h
new file mode 100644 (file)
index 0000000..109b5c0
--- /dev/null
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.  Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_TIME_H__
+#define __XFS_SUPPORT_TIME_H__
+
+#include <linux/sched.h>
+#include <linux/time.h>
+
+typedef struct timespec timespec_t;
+
+static inline void delay(long ticks)
+{
+       current->state = TASK_UNINTERRUPTIBLE;
+       schedule_timeout(ticks);
+}
+
+static inline void nanotime(struct timespec *tvp)
+{
+       *tvp = CURRENT_TIME;
+}
+
+#endif /* __XFS_SUPPORT_TIME_H__ */
diff --git a/fs/xfs/linux/xfs_behavior.c b/fs/xfs/linux/xfs_behavior.c
deleted file mode 100644 (file)
index 16088e1..0000000
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like.  Any license provided herein, whether implied or
- * otherwise, applies only to this software file.  Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA  94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- *
- */
-#include "xfs.h"
-
-/*
- * Source file used to associate/disassociate behaviors with virtualized
- * objects.  See xfs_behavior.h for more information about behaviors, etc.
- *
- * The implementation is split between functions in this file and macros
- * in xfs_behavior.h.
- */
-
-/*
- * Insert a new behavior descriptor into a behavior chain.
- *
- * The behavior chain is ordered based on the 'position' number which
- * lives in the first field of the ops vector (higher numbers first).
- *
- * Attempts to insert duplicate ops result in an EINVAL return code.
- * Otherwise, return 0 to indicate success.
- */
-int
-bhv_insert(bhv_head_t *bhp, bhv_desc_t *bdp)
-{
-       bhv_desc_t      *curdesc, *prev;
-       int             position;
-
-       /*
-        * Validate the position value of the new behavior.
-        */
-       position = BHV_POSITION(bdp);
-       ASSERT(position >= BHV_POSITION_BASE && position <= BHV_POSITION_TOP);
-
-       /*
-        * Find location to insert behavior.  Check for duplicates.
-        */
-       prev = NULL;
-       for (curdesc = bhp->bh_first;
-            curdesc != NULL;
-            curdesc = curdesc->bd_next) {
-
-               /* Check for duplication. */
-               if (curdesc->bd_ops == bdp->bd_ops) {
-                       ASSERT(0);
-                       return EINVAL;
-               }
-
-               /* Find correct position */
-               if (position >= BHV_POSITION(curdesc)) {
-                       ASSERT(position != BHV_POSITION(curdesc));
-                       break;          /* found it */
-               }
-
-               prev = curdesc;
-       }
-
-       if (prev == NULL) {
-               /* insert at front of chain */
-               bdp->bd_next = bhp->bh_first;
-               bhp->bh_first = bdp;
-       } else {
-               /* insert after prev */
-               bdp->bd_next = prev->bd_next;
-               prev->bd_next = bdp;
-       }
-
-       return 0;
-}
-
-/*
- * Remove a behavior descriptor from a position in a behavior chain;
- * the position is guaranteed not to be the first position.
- * Should only be called by the bhv_remove() macro.
- */
-void
-bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp)
-{
-       bhv_desc_t      *curdesc, *prev;
-
-       ASSERT(bhp->bh_first != NULL);
-       ASSERT(bhp->bh_first->bd_next != NULL);
-
-       prev = bhp->bh_first;
-       for (curdesc = bhp->bh_first->bd_next;
-            curdesc != NULL;
-            curdesc = curdesc->bd_next) {
-
-               if (curdesc == bdp)
-                       break;          /* found it */
-               prev = curdesc;
-       }
-
-       ASSERT(curdesc == bdp);
-       prev->bd_next = bdp->bd_next;   /* remove from after prev */
-}
-
-/*
- * Look for a specific ops vector on the specified behavior chain.
- * Return the associated behavior descriptor.  Or NULL, if not found.
- */
-bhv_desc_t *
-bhv_lookup(bhv_head_t *bhp, void *ops)
-{
-       bhv_desc_t      *curdesc;
-
-       for (curdesc = bhp->bh_first;
-            curdesc != NULL;
-            curdesc = curdesc->bd_next) {
-
-               if (curdesc->bd_ops == ops)
-                       return curdesc;
-       }
-
-       return NULL;
-}
-
-/*
- * Looks for the first behavior within a specified range of positions.
- * Return the associated behavior descriptor.  Or NULL, if none found.
- */
-bhv_desc_t *
-bhv_lookup_range(bhv_head_t *bhp, int low, int high)
-{
-       bhv_desc_t      *curdesc;
-
-       for (curdesc = bhp->bh_first;
-            curdesc != NULL;
-            curdesc = curdesc->bd_next) {
-
-               int     position = BHV_POSITION(curdesc);
-
-               if (position <= high) {
-                       if (position >= low)
-                               return curdesc;
-                       return NULL;
-               }
-       }
-
-       return NULL;
-}
-
-/*
- * Return the base behavior in the chain, or NULL if the chain
- * is empty.
- *
- * The caller has not read locked the behavior chain, so acquire the
- * lock before traversing the chain.
- */
-bhv_desc_t *
-bhv_base(bhv_head_t *bhp)
-{
-       bhv_desc_t      *curdesc;
-
-       for (curdesc = bhp->bh_first;
-            curdesc != NULL;
-            curdesc = curdesc->bd_next) {
-
-               if (curdesc->bd_next == NULL) {
-                       return curdesc;
-               }
-       }
-
-       return NULL;
-}
-
-void
-bhv_head_init(
-       bhv_head_t *bhp,
-       char *name)
-{
-       bhp->bh_first = NULL;
-}
-
-void
-bhv_insert_initial(
-       bhv_head_t *bhp,
-       bhv_desc_t *bdp)
-{
-       ASSERT(bhp->bh_first == NULL);
-       (bhp)->bh_first = bdp;
-}
-
-void
-bhv_head_destroy(
-       bhv_head_t *bhp)
-{
-       ASSERT(bhp->bh_first == NULL);
-}
diff --git a/fs/xfs/linux/xfs_behavior.h b/fs/xfs/linux/xfs_behavior.h
deleted file mode 100644 (file)
index d5ed5a8..0000000
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like.  Any license provided herein, whether implied or
- * otherwise, applies only to this software file.  Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA  94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-#ifndef __XFS_BEHAVIOR_H__
-#define __XFS_BEHAVIOR_H__
-
-/*
- * Header file used to associate behaviors with virtualized objects.
- *
- * A virtualized object is an internal, virtualized representation of
- * OS entities such as persistent files, processes, or sockets.  Examples
- * of virtualized objects include vnodes, vprocs, and vsockets.  Often
- * a virtualized object is referred to simply as an "object."
- *
- * A behavior is essentially an implementation layer associated with
- * an object.  Multiple behaviors for an object are chained together,
- * the order of chaining determining the order of invocation.  Each
- * behavior of a given object implements the same set of interfaces
- * (e.g., the VOP interfaces).
- *
- * Behaviors may be dynamically inserted into an object's behavior chain,
- * such that the addition is transparent to consumers that already have
- * references to the object.  Typically, a given behavior will be inserted
- * at a particular location in the behavior chain.  Insertion of new
- * behaviors is synchronized with operations-in-progress (oip's) so that
- * the oip's always see a consistent view of the chain.
- *
- * The term "interposition" is used to refer to the act of inserting
- * a behavior such that it interposes on (i.e., is inserted in front
- * of) a particular other behavior.  A key example of this is when a
- * system implementing distributed single system image wishes to
- * interpose a distribution layer (providing distributed coherency)
- * in front of an object that is otherwise only accessed locally.
- *
- * Note that the traditional vnode/inode combination is simply a virtualized
- * object that has exactly one associated behavior.
- *
- * Behavior synchronization is logic which is necessary under certain
- * circumstances that there is no conflict between ongoing operations
- * traversing the behavior chain and those dynamically modifying the
- * behavior chain.  Because behavior synchronization adds extra overhead
- * to virtual operation invocation, we want to restrict, as much as
- * we can, the requirement for this extra code, to those situations
- * in which it is truly necessary.
- *
- * Behavior synchronization is needed whenever there's at least one class
- * of object in the system for which:
- * 1) multiple behaviors for a given object are supported,
- * -- AND --
- * 2a) insertion of a new behavior can happen dynamically at any time during
- *     the life of an active object,
- *     -- AND --
- *     3a) insertion of a new behavior needs to synchronize with existing
- *         ops-in-progress.
- *     -- OR --
- *     3b) multiple different behaviors can be dynamically inserted at
- *         any time during the life of an active object
- *     -- OR --
- *     3c) removal of a behavior can occur at any time during the life of
- *         an active object.
- * -- OR --
- * 2b) removal of a behavior can occur at any time during the life of an
- *     active object
- *
- */
-
-struct bhv_head_lock;
-
-/*
- * Behavior head.  Head of the chain of behaviors.
- * Contained within each virtualized object data structure.
- */
-typedef struct bhv_head {
-       struct bhv_desc *bh_first;      /* first behavior in chain */
-       struct bhv_head_lock *bh_lockp; /* pointer to lock info struct */
-} bhv_head_t;
-
-/*
- * Behavior descriptor.         Descriptor associated with each behavior.
- * Contained within the behavior's private data structure.
- */
-typedef struct bhv_desc {
-       void            *bd_pdata;      /* private data for this behavior */
-       void            *bd_vobj;       /* virtual object associated with */
-       void            *bd_ops;        /* ops for this behavior */
-       struct bhv_desc *bd_next;       /* next behavior in chain */
-} bhv_desc_t;
-
-/*
- * Behavior identity field.  A behavior's identity determines the position
- * where it lives within a behavior chain, and it's always the first field
- * of the behavior's ops vector. The optional id field further identifies the
- * subsystem responsible for the behavior.
- */
-typedef struct bhv_identity {
-       __u16   bi_id;          /* owning subsystem id */
-       __u16   bi_position;    /* position in chain */
-} bhv_identity_t;
-
-typedef bhv_identity_t bhv_position_t;
-
-#define BHV_IDENTITY_INIT(id,pos)      {id, pos}
-#define BHV_IDENTITY_INIT_POSITION(pos) BHV_IDENTITY_INIT(0, pos)
-
-/*
- * Define boundaries of position values.
- */
-#define BHV_POSITION_INVALID   0       /* invalid position number */
-#define BHV_POSITION_BASE      1       /* base (last) implementation layer */
-#define BHV_POSITION_TOP       63      /* top (first) implementation layer */
-
-/*
- * Plumbing macros.
- */
-#define BHV_HEAD_FIRST(bhp)    (ASSERT((bhp)->bh_first), (bhp)->bh_first)
-#define BHV_NEXT(bdp)          (ASSERT((bdp)->bd_next), (bdp)->bd_next)
-#define BHV_NEXTNULL(bdp)      ((bdp)->bd_next)
-#define BHV_VOBJ(bdp)          (ASSERT((bdp)->bd_vobj), (bdp)->bd_vobj)
-#define BHV_VOBJNULL(bdp)      ((bdp)->bd_vobj)
-#define BHV_PDATA(bdp)         (bdp)->bd_pdata
-#define BHV_OPS(bdp)           (bdp)->bd_ops
-#define BHV_IDENTITY(bdp)      ((bhv_identity_t *)(bdp)->bd_ops)
-#define BHV_POSITION(bdp)      (BHV_IDENTITY(bdp)->bi_position)
-
-extern void bhv_head_init(bhv_head_t *, char *);
-extern void bhv_head_destroy(bhv_head_t *);
-extern int  bhv_insert(bhv_head_t *, bhv_desc_t *);
-extern void bhv_insert_initial(bhv_head_t *, bhv_desc_t *);
-
-/*
- * Initialize a new behavior descriptor.
- * Arguments:
- *   bdp - pointer to behavior descriptor
- *   pdata - pointer to behavior's private data
- *   vobj - pointer to associated virtual object
- *   ops - pointer to ops for this behavior
- */
-#define bhv_desc_init(bdp, pdata, vobj, ops)           \
- {                                                     \
-       (bdp)->bd_pdata = pdata;                        \
-       (bdp)->bd_vobj = vobj;                          \
-       (bdp)->bd_ops = ops;                            \
-       (bdp)->bd_next = NULL;                          \
- }
-
-/*
- * Remove a behavior descriptor from a behavior chain.
- */
-#define bhv_remove(bhp, bdp)                           \
- {                                                     \
-       if ((bhp)->bh_first == (bdp)) {                 \
-               /*                                      \
-               * Remove from front of chain.           \
-               * Atomic wrt oip's.                     \
-               */                                      \
-              (bhp)->bh_first = (bdp)->bd_next;        \
-       } else {                                        \
-              /* remove from non-front of chain */     \
-              bhv_remove_not_first(bhp, bdp);          \
-       }                                               \
-       (bdp)->bd_vobj = NULL;                          \
- }
-
-/*
- * Behavior module prototypes.
- */
-extern void            bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp);
-extern bhv_desc_t *    bhv_lookup(bhv_head_t *bhp, void *ops);
-extern bhv_desc_t *    bhv_lookup_range(bhv_head_t *bhp, int low, int high);
-extern bhv_desc_t *    bhv_base(bhv_head_t *bhp);
-
-/* No bhv locking on Linux */
-#define bhv_lookup_unlocked    bhv_lookup
-#define bhv_base_unlocked      bhv_base
-
-#endif /* __XFS_BEHAVIOR_H__ */
diff --git a/fs/xfs/linux/xfs_buf.c b/fs/xfs/linux/xfs_buf.c
new file mode 100644 (file)
index 0000000..891fa44
--- /dev/null
@@ -0,0 +1,2110 @@
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.  Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ *     page_buf.c
+ *
+ *     The page_buf module provides an abstract buffer cache model on top of
+ *     the Linux page cache.  Cached metadata blocks for a file system are
+ *     hashed to the inode for the block device.  The page_buf module
+ *     assembles buffer (page_buf_t) objects on demand to aggregate such
+ *     cached pages for I/O.
+ *
+ *
+ *      Written by Steve Lord, Jim Mostek, Russell Cattelan
+ *                 and Rajagopal Ananthanarayanan ("ananth") at SGI.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/stddef.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/workqueue.h>
+#include <linux/suspend.h>
+#include <linux/percpu.h>
+
+#include <support/ktrace.h>
+#include <support/debug.h>
+#include "kmem.h"
+
+#include "xfs_types.h"
+#include "xfs_cred.h"
+#include "xfs_lrw.h"
+#include "xfs_buf.h"
+
+#define BBSHIFT                9
+#define BN_ALIGN_MASK  ((1 << (PAGE_CACHE_SHIFT - BBSHIFT)) - 1)
+
+#ifndef GFP_READAHEAD
+#define GFP_READAHEAD  (__GFP_NOWARN|__GFP_NORETRY)
+#endif
+
+/*
+ * File wide globals
+ */
+
+STATIC kmem_cache_t *pagebuf_cache;
+STATIC void pagebuf_daemon_wakeup(int);
+STATIC void pagebuf_delwri_queue(page_buf_t *, int);
+STATIC struct workqueue_struct *pagebuf_logio_workqueue;
+STATIC struct workqueue_struct *pagebuf_dataio_workqueue;
+
+/*
+ * Pagebuf module configuration parameters, exported via
+ * /proc/sys/vm/pagebuf
+ */
+
+typedef struct pb_sysctl_val {
+       int     min;
+       int     val;
+       int     max;
+} pb_sysctl_val_t;
+
+struct {
+       pb_sysctl_val_t flush_interval; /* interval between runs of the
+                                        * delwri flush daemon.  */
+       pb_sysctl_val_t age_buffer;     /* time for buffer to age before
+                                        * we flush it.  */
+       pb_sysctl_val_t stats_clear;    /* clear the pagebuf stats */
+       pb_sysctl_val_t debug;          /* debug tracing on or off */
+} pb_params = {
+                         /*    MIN     DFLT    MAX     */
+       .flush_interval = {     HZ/2,   HZ,     30*HZ   },
+       .age_buffer     = {     1*HZ,   15*HZ,  300*HZ  },
+       .stats_clear    = {     0,      0,      1       },
+       .debug          = {     0,      0,      1       },
+};
+
+enum {
+       PB_FLUSH_INT = 1,
+       PB_FLUSH_AGE = 2,
+       PB_STATS_CLEAR = 3,
+       PB_DEBUG = 4,
+};
+
+/*
+ * Pagebuf statistics variables
+ */
+
+struct pbstats {
+       u_int32_t       pb_get;
+       u_int32_t       pb_create;
+       u_int32_t       pb_get_locked;
+       u_int32_t       pb_get_locked_waited;
+       u_int32_t       pb_busy_locked;
+       u_int32_t       pb_miss_locked;
+       u_int32_t       pb_page_retries;
+       u_int32_t       pb_page_found;
+       u_int32_t       pb_get_read;
+} pbstats;
+DEFINE_PER_CPU(struct pbstats, pbstats);
+
+/* We don't disable preempt, not too worried about poking the
+ * wrong cpu's stat for now */
+#define PB_STATS_INC(count)    (__get_cpu_var(pbstats).count++)
+
+/*
+ * Pagebuf debugging
+ */
+
+#ifdef PAGEBUF_TRACE
+void
+pagebuf_trace(
+       page_buf_t      *pb,
+       char            *id,
+       void            *data,
+       void            *ra)
+{
+       if (!pb_params.debug.val)
+               return;
+       ktrace_enter(pagebuf_trace_buf,
+               pb, id,
+               (void *)(unsigned long)pb->pb_flags,
+               (void *)(unsigned long)pb->pb_hold.counter,
+               (void *)(unsigned long)pb->pb_sema.count.counter,
+               (void *)current,
+               data, ra,
+               (void *)(unsigned long)((pb->pb_file_offset>>32) & 0xffffffff),
+               (void *)(unsigned long)(pb->pb_file_offset & 0xffffffff),
+               (void *)(unsigned long)pb->pb_buffer_length,
+               NULL, NULL, NULL, NULL, NULL);
+}
+ktrace_t *pagebuf_trace_buf;
+EXPORT_SYMBOL(pagebuf_trace_buf);
+#define PAGEBUF_TRACE_SIZE     4096
+#define PB_TRACE(pb, id, data) \
+       pagebuf_trace(pb, id, (void *)data, (void *)__builtin_return_address(0))
+#else
+#define PB_TRACE(pb, id, data) do { } while (0)
+#endif
+
+#ifdef PAGEBUF_LOCK_TRACKING
+# define PB_SET_OWNER(pb)      ((pb)->pb_last_holder = current->pid)
+# define PB_CLEAR_OWNER(pb)    ((pb)->pb_last_holder = -1)
+# define PB_GET_OWNER(pb)      ((pb)->pb_last_holder)
+#else
+# define PB_SET_OWNER(pb)      do { } while (0)
+# define PB_CLEAR_OWNER(pb)    do { } while (0)
+# define PB_GET_OWNER(pb)      do { } while (0)
+#endif
+
+/*
+ * Pagebuf allocation / freeing.
+ */
+
+#define pb_to_gfp(flags) \
+       (((flags) & PBF_READ_AHEAD) ? GFP_READAHEAD : \
+        ((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL)
+
+#define pagebuf_allocate(flags) \
+       kmem_cache_alloc(pagebuf_cache, pb_to_gfp(flags))
+#define pagebuf_deallocate(pb) \
+       kmem_cache_free(pagebuf_cache, (pb));
+
+/*
+ * Pagebuf hashing
+ */
+
+#define NBITS  8
+#define NHASH  (1<<NBITS)
+
+typedef struct {
+       struct list_head        pb_hash;
+       int                     pb_count;
+       spinlock_t              pb_hash_lock;
+} pb_hash_t;
+
+STATIC pb_hash_t       pbhash[NHASH];
+#define pb_hash(pb)    &pbhash[pb->pb_hash_index]
+
+STATIC int
+_bhash(
+       struct block_device *bdev,
+       loff_t          base)
+{
+       int             bit, hval;
+
+       base >>= 9;
+       base ^= (unsigned long)bdev / L1_CACHE_BYTES;
+       for (bit = hval = 0; base && bit < sizeof(base) * 8; bit += NBITS) {
+               hval ^= (int)base & (NHASH-1);
+               base >>= NBITS;
+       }
+       return hval;
+}
+
+/*
+ * Mapping of multi-page buffers into contiguous virtual space
+ */
+
+STATIC void *pagebuf_mapout_locked(page_buf_t *);
+
+typedef struct a_list {
+       void            *vm_addr;
+       struct a_list   *next;
+} a_list_t;
+
+STATIC a_list_t                *as_free_head;
+STATIC int             as_list_len;
+STATIC spinlock_t      as_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * Try to batch vunmaps because they are costly.
+ */
+STATIC void
+free_address(
+       void            *addr)
+{
+       a_list_t        *aentry;
+
+       aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC);
+       if (aentry) {
+               spin_lock(&as_lock);
+               aentry->next = as_free_head;
+               aentry->vm_addr = addr;
+               as_free_head = aentry;
+               as_list_len++;
+               spin_unlock(&as_lock);
+       } else {
+               vunmap(addr);
+       }
+}
+
+STATIC void
+purge_addresses(void)
+{
+       a_list_t        *aentry, *old;
+
+       if (as_free_head == NULL)
+               return;
+
+       spin_lock(&as_lock);
+       aentry = as_free_head;
+       as_free_head = NULL;
+       as_list_len = 0;
+       spin_unlock(&as_lock);
+
+       while ((old = aentry) != NULL) {
+               vunmap(aentry->vm_addr);
+               aentry = aentry->next;
+               kfree(old);
+       }
+}
+
+/*
+ *     Internal pagebuf object manipulation
+ */
+
+STATIC void
+_pagebuf_initialize(
+       page_buf_t              *pb,
+       pb_target_t             *target,
+       loff_t                  range_base,
+       size_t                  range_length,
+       page_buf_flags_t        flags)
+{
+       /*
+        * We don't want certain flags to appear in pb->pb_flags.
+        */
+       flags &= ~(PBF_LOCK|PBF_MAPPED|PBF_DONT_BLOCK|PBF_READ_AHEAD);
+
+       memset(pb, 0, sizeof(page_buf_t));
+       atomic_set(&pb->pb_hold, 1);
+       init_MUTEX_LOCKED(&pb->pb_iodonesema);
+       INIT_LIST_HEAD(&pb->pb_list);
+       INIT_LIST_HEAD(&pb->pb_hash_list);
+       init_MUTEX_LOCKED(&pb->pb_sema); /* held, no waiters */
+       PB_SET_OWNER(pb);
+       pb->pb_target = target;
+       pb->pb_file_offset = range_base;
+       /*
+        * Set buffer_length and count_desired to the same value initially.
+        * IO routines should use count_desired, which will be the same in
+        * most cases but may be reset (e.g. XFS recovery).
+        */
+       pb->pb_buffer_length = pb->pb_count_desired = range_length;
+       pb->pb_flags = flags | PBF_NONE;
+       pb->pb_bn = PAGE_BUF_DADDR_NULL;
+       atomic_set(&pb->pb_pin_count, 0);
+       init_waitqueue_head(&pb->pb_waiters);
+
+       PB_STATS_INC(pb_create);
+       PB_TRACE(pb, "initialize", target);
+}
+
+/*
+ * Allocate a page array capable of holding a specified number
+ * of pages, and point the page buf at it.
+ */
+STATIC int
+_pagebuf_get_pages(
+       page_buf_t              *pb,
+       int                     page_count,
+       page_buf_flags_t        flags)
+{
+       int                     gpf_mask = pb_to_gfp(flags);
+
+       /* Make sure that we have a page list */
+       if (pb->pb_pages == NULL) {
+               pb->pb_offset = page_buf_poff(pb->pb_file_offset);
+               pb->pb_page_count = page_count;
+               if (page_count <= PB_PAGES) {
+                       pb->pb_pages = pb->pb_page_array;
+               } else {
+                       pb->pb_pages = kmalloc(sizeof(struct page *) *
+                                       page_count, gpf_mask);
+                       if (pb->pb_pages == NULL)
+                               return -ENOMEM;
+               }
+               memset(pb->pb_pages, 0, sizeof(struct page *) * page_count);
+       }
+       return 0;
+}
+
+/*
+ * Walk a pagebuf releasing all the pages contained within it.
+ */
+STATIC inline void
+_pagebuf_freepages(
+       page_buf_t              *pb)
+{
+       int                     buf_index;
+
+       for (buf_index = 0; buf_index < pb->pb_page_count; buf_index++) {
+               struct page     *page = pb->pb_pages[buf_index];
+
+               if (page) {
+                       pb->pb_pages[buf_index] = NULL;
+                       page_cache_release(page);
+               }
+       }
+}
+
+/*
+ *     _pagebuf_free_object
+ *
+ *     _pagebuf_free_object releases the contents specified buffer.
+ *     The modification state of any associated pages is left unchanged.
+ */
+void
+_pagebuf_free_object(
+       pb_hash_t               *hash,  /* hash bucket for buffer */
+       page_buf_t              *pb)    /* buffer to deallocate */
+{
+       page_buf_flags_t        pb_flags = pb->pb_flags;
+
+       PB_TRACE(pb, "free_object", 0);
+       pb->pb_flags |= PBF_FREED;
+
+       if (hash) {
+               if (!list_empty(&pb->pb_hash_list)) {
+                       hash->pb_count--;
+                       list_del_init(&pb->pb_hash_list);
+               }
+               spin_unlock(&hash->pb_hash_lock);
+       }
+
+       if (!(pb_flags & PBF_FREED)) {
+               /* release any virtual mapping */ ;
+               if (pb->pb_flags & _PBF_ADDR_ALLOCATED) {
+                       void *vaddr = pagebuf_mapout_locked(pb);
+                       if (vaddr) {
+                               free_address(vaddr);
+                       }
+               }
+
+               if (pb->pb_flags & _PBF_MEM_ALLOCATED) {
+                       if (pb->pb_pages) {
+                               /* release the pages in the address list */
+                               if ((pb->pb_pages[0]) &&
+                                   (pb->pb_flags & _PBF_MEM_SLAB)) {
+                                       kfree(pb->pb_addr);
+                               } else {
+                                       _pagebuf_freepages(pb);
+                               }
+                               if (pb->pb_pages != pb->pb_page_array)
+                                       kfree(pb->pb_pages);
+                               pb->pb_pages = NULL;
+                       }
+                       pb->pb_flags &= ~(_PBF_MEM_ALLOCATED|_PBF_MEM_SLAB);
+               }
+       }
+
+       pagebuf_deallocate(pb);
+}
+
+/*
+ *     _pagebuf_lookup_pages
+ *
+ *     _pagebuf_lookup_pages finds all pages which match the buffer
+ *     in question and the range of file offsets supplied,
+ *     and builds the page list for the buffer, if the
+ *     page list is not already formed or if not all of the pages are
+ *     already in the list. Invalid pages (pages which have not yet been
+ *     read in from disk) are assigned for any pages which are not found.
+ *
+ *     Returns 0 on success, or a negative errno (-ENOMEM) when pages
+ *     cannot be allocated or the multi-page vmap() fails.
+ */
+STATIC int
+_pagebuf_lookup_pages(
+       page_buf_t              *pb,
+       struct address_space    *aspace,
+       page_buf_flags_t        flags)
+{
+       loff_t                  next_buffer_offset;
+       unsigned long           page_count, pi, index;
+       struct page             *page;
+       int                     gfp_mask, retry_count = 5, rval = 0;
+       int                     all_mapped, good_pages, nbytes;
+       unsigned int            blocksize, sectorshift;
+       size_t                  size, offset;
+
+
+       /* For pagebufs where we want to map an address, do not use
+        * highmem pages - so that we do not need to use kmap resources
+        * to access the data.
+        *
+        * For pages where the caller has indicated there may be resource
+        * contention (e.g. called from a transaction) do not flush
+        * delalloc pages to obtain memory.
+        */
+
+       if (flags & PBF_READ_AHEAD) {
+               /* Readahead is best-effort: no allocation retries below */
+               gfp_mask = GFP_READAHEAD;
+               retry_count = 0;
+       } else if (flags & PBF_DONT_BLOCK) {
+               gfp_mask = GFP_NOFS;
+       } else if (flags & PBF_MAPPABLE) {
+               gfp_mask = GFP_KERNEL;
+       } else {
+               gfp_mask = GFP_HIGHUSER;
+       }
+
+       /* Number of page-cache pages spanned by the buffer */
+       next_buffer_offset = pb->pb_file_offset + pb->pb_buffer_length;
+
+       good_pages = page_count = (page_buf_btoc(next_buffer_offset) -
+                                  page_buf_btoct(pb->pb_file_offset));
+
+       if (pb->pb_flags & _PBF_ALL_PAGES_MAPPED) {
+               /* Bring pages forward in cache */
+               for (pi = 0; pi < page_count; pi++) {
+                       mark_page_accessed(pb->pb_pages[pi]);
+               }
+               if ((flags & PBF_MAPPED) && !(pb->pb_flags & PBF_MAPPED)) {
+                       all_mapped = 1;
+                       goto mapit;
+               }
+               return 0;
+       }
+
+       /* Ensure pb_pages field has been initialised */
+       rval = _pagebuf_get_pages(pb, page_count, flags);
+       if (rval)
+               return rval;
+
+       rval = pi = 0;
+       blocksize = pb->pb_target->pbr_bsize;
+       sectorshift = pb->pb_target->pbr_sshift;
+       size = pb->pb_count_desired;
+       offset = pb->pb_offset;
+
+       /* Enter the pages in the page list */
+       index = (pb->pb_file_offset - pb->pb_offset) >> PAGE_CACHE_SHIFT;
+       for (all_mapped = 1; pi < page_count; pi++, index++) {
+               if (pb->pb_pages[pi] == 0) {
+                     retry:
+                       page = find_or_create_page(aspace, index, gfp_mask);
+                       if (!page) {
+                               if (--retry_count > 0) {
+                                       /* Wake the pagebuf daemon to push
+                                        * buffers out, nap briefly, then
+                                        * retry the allocation.
+                                        */
+                                       PB_STATS_INC(pb_page_retries);
+                                       pagebuf_daemon_wakeup(1);
+                                       current->state = TASK_UNINTERRUPTIBLE;
+                                       schedule_timeout(10);
+                                       goto retry;
+                               }
+                               rval = -ENOMEM;
+                               all_mapped = 0;
+                               continue;
+                       }
+                       PB_STATS_INC(pb_page_found);
+                       mark_page_accessed(page);
+                       pb->pb_pages[pi] = page;
+               } else {
+                       page = pb->pb_pages[pi];
+                       lock_page(page);
+               }
+
+               /* Bytes of this buffer that land in this page */
+               nbytes = PAGE_CACHE_SIZE - offset;
+               if (nbytes > size)
+                       nbytes = size;
+               size -= nbytes;
+
+               if (!PageUptodate(page)) {
+                       if (blocksize == PAGE_CACHE_SIZE) {
+                               if (flags & PBF_READ)
+                                       pb->pb_locked = 1;
+                               good_pages--;
+                       } else if (!PagePrivate(page)) {
+                               unsigned long   i, range;
+
+                               /*
+                                * In this case page->private holds a bitmap
+                                * of uptodate sectors within the page
+                                */
+                               ASSERT(blocksize < PAGE_CACHE_SIZE);
+                               range = (offset + nbytes) >> sectorshift;
+                               for (i = offset >> sectorshift; i < range; i++)
+                                       if (!test_bit(i, &page->private))
+                                               break;
+                               if (i != range)
+                                       good_pages--;
+                       } else {
+                               good_pages--;
+                       }
+               }
+               offset = 0;
+       }
+
+       /* Unless the pages stay locked for a read (pb_locked set above),
+        * release the page locks now that the list is built.
+        */
+       if (!pb->pb_locked) {
+               for (pi = 0; pi < page_count; pi++) {
+                       if (pb->pb_pages[pi])
+                               unlock_page(pb->pb_pages[pi]);
+               }
+       }
+
+mapit:
+       pb->pb_flags |= _PBF_MEM_ALLOCATED;
+       if (all_mapped) {
+               pb->pb_flags |= _PBF_ALL_PAGES_MAPPED;
+
+               /* A single page buffer is always mappable */
+               if (page_count == 1) {
+                       pb->pb_addr = (caddr_t)
+                               page_address(pb->pb_pages[0]) + pb->pb_offset;
+                       pb->pb_flags |= PBF_MAPPED;
+               } else if (flags & PBF_MAPPED) {
+                       /* NOTE(review): as_list_len/purge_addresses() appear
+                        * to manage stale vmap aliases -- defined elsewhere;
+                        * confirm.
+                        */
+                       if (as_list_len > 64)
+                               purge_addresses();
+                       pb->pb_addr = vmap(pb->pb_pages, page_count,
+                                       VM_MAP, PAGE_KERNEL);
+                       if (pb->pb_addr == NULL)
+                               return -ENOMEM;
+                       pb->pb_addr += pb->pb_offset;
+                       pb->pb_flags |= PBF_MAPPED | _PBF_ADDR_ALLOCATED;
+               }
+       }
+       /* If some pages were found with data in them
+        * we are not in PBF_NONE state.
+        */
+       if (good_pages != 0) {
+               pb->pb_flags &= ~(PBF_NONE);
+               if (good_pages != page_count) {
+                       pb->pb_flags |= PBF_PARTIAL;
+               }
+       }
+
+       PB_TRACE(pb, "lookup_pages", (long)good_pages);
+
+       return rval;
+}
+
+/*
+ *     Finding and Reading Buffers
+ */
+
+/*
+ *     _pagebuf_find
+ *
+ *     Looks up, and creates if absent, a lockable buffer for
+ *     a given range of an inode.  The buffer is returned
+ *     locked.  If other overlapping buffers exist, they are
+ *     released before the new buffer is created and locked,
+ *     which may imply that this call will block until those buffers
+ *     are unlocked.  No I/O is implied by this call.
+ *
+ *     Returns the matching (or newly inserted new_pb) buffer, or NULL
+ *     when a PBF_TRYLOCK attempt on an existing buffer fails.
+ */
+STATIC page_buf_t *
+_pagebuf_find(                         /* find buffer for block        */
+       pb_target_t             *target,/* target for block             */
+       loff_t                  ioff,   /* starting offset of range     */
+       size_t                  isize,  /* length of range              */
+       page_buf_flags_t        flags,  /* PBF_TRYLOCK                  */
+       page_buf_t              *new_pb)/* newly allocated buffer       */
+{
+       loff_t                  range_base;
+       size_t                  range_length;
+       int                     hval;
+       pb_hash_t               *h;
+       struct list_head        *p;
+       page_buf_t              *pb;
+       int                     not_locked;
+
+       /* ioff/isize are in basic blocks; convert to bytes */
+       range_base = (ioff << BBSHIFT);
+       range_length = (isize << BBSHIFT);
+
+       /* Ensure we never do IOs smaller than the sector size */
+       BUG_ON(range_length < (1 << target->pbr_sshift));
+
+       /* Ensure we never do IOs that are not sector aligned */
+       BUG_ON(range_base & (loff_t)target->pbr_smask);
+
+       hval = _bhash(target->pbr_bdev, range_base);
+       h = &pbhash[hval];
+
+       spin_lock(&h->pb_hash_lock);
+       list_for_each(p, &h->pb_hash) {
+               pb = list_entry(p, page_buf_t, pb_hash_list);
+
+               if ((target == pb->pb_target) &&
+                   (pb->pb_file_offset == range_base) &&
+                   (pb->pb_buffer_length == range_length)) {
+                       /* NOTE(review): a PBF_FREED match breaks out and is
+                        * treated as "no match"; the stale entry stays on
+                        * the chain -- confirm it is reclaimed elsewhere.
+                        */
+                       if (pb->pb_flags & PBF_FREED)
+                               break;
+                       /* If we look at something bring it to the
+                        * front of the list for next time
+                        */
+                       list_del(&pb->pb_hash_list);
+                       list_add(&pb->pb_hash_list, &h->pb_hash);
+                       goto found;
+               }
+       }
+
+       /* No match found */
+       if (new_pb) {
+               /* Initialise and insert the caller-supplied buffer while
+                * still holding the hash chain lock.
+                */
+               _pagebuf_initialize(new_pb, target, range_base,
+                               range_length, flags | _PBF_LOCKABLE);
+               new_pb->pb_hash_index = hval;
+               h->pb_count++;
+               list_add(&new_pb->pb_hash_list, &h->pb_hash);
+       } else {
+               PB_STATS_INC(pb_miss_locked);
+       }
+
+       spin_unlock(&h->pb_hash_lock);
+       return (new_pb);
+
+found:
+       atomic_inc(&pb->pb_hold);
+       spin_unlock(&h->pb_hash_lock);
+
+       /* Attempt to get the semaphore without sleeping,
+        * if this does not work then we need to drop the
+        * spinlock and do a hard attempt on the semaphore.
+        */
+       not_locked = down_trylock(&pb->pb_sema);
+       if (not_locked) {
+               if (!(flags & PBF_TRYLOCK)) {
+                       /* wait for buffer ownership */
+                       PB_TRACE(pb, "get_lock", 0);
+                       pagebuf_lock(pb);
+                       PB_STATS_INC(pb_get_locked_waited);
+               } else {
+                       /* We asked for a trylock and failed, no need
+                        * to look at file offset and length here, we
+                        * know that this pagebuf at least overlaps our
+                        * pagebuf and is locked, therefore our buffer
+                        * either does not exist, or is this buffer
+                        */
+
+                       pagebuf_rele(pb);
+                       PB_STATS_INC(pb_busy_locked);
+                       return (NULL);
+               }
+       } else {
+               /* trylock worked */
+               PB_SET_OWNER(pb);
+       }
+
+       /* A stale buffer keeps only its mapping/allocation state bits;
+        * everything else is cleared before the buffer is reused.
+        */
+       if (pb->pb_flags & PBF_STALE)
+               pb->pb_flags &= PBF_MAPPABLE | \
+                               PBF_MAPPED | \
+                               _PBF_LOCKABLE | \
+                               _PBF_ALL_PAGES_MAPPED | \
+                               _PBF_ADDR_ALLOCATED | \
+                               _PBF_MEM_ALLOCATED | \
+                               _PBF_MEM_SLAB;
+       PB_TRACE(pb, "got_lock", 0);
+       PB_STATS_INC(pb_get_locked);
+       return (pb);
+}
+
+/*
+ *     pagebuf_find
+ *
+ *     Return the buffer matching the given range on the given target,
+ *     if one is already in memory; NULL otherwise.  No new buffer is
+ *     allocated (no new_pb is passed down), so this is a pure lookup.
+ *     Even when a buffer is returned, not all of every page in it
+ *     need be valid.
+ */
+page_buf_t *
+pagebuf_find(                          /* find buffer for block        */
+                                       /* if the block is in memory    */
+       pb_target_t             *target,/* target for block             */
+       loff_t                  ioff,   /* starting offset of range     */
+       size_t                  isize,  /* length of range              */
+       page_buf_flags_t        flags)  /* PBF_TRYLOCK                  */
+{
+       page_buf_t              *found;
+
+       found = _pagebuf_find(target, ioff, isize, flags, NULL);
+       return found;
+}
+
+/*
+ *     pagebuf_get
+ *
+ *     pagebuf_get assembles a buffer covering the specified range.
+ *     Some or all of the blocks in the range may be valid.  Storage
+ *     in memory for all portions of the buffer will be allocated,
+ *     although backing storage may not be.  If PBF_READ is set in
+ *     flags, pagebuf_iostart is called also.
+ *
+ *     Returns NULL on allocation failure, trylock failure, or for a
+ *     readahead that is already satisfied (the buffer is dropped).
+ */
+page_buf_t *
+pagebuf_get(                           /* allocate a buffer            */
+       pb_target_t             *target,/* target for buffer            */
+       loff_t                  ioff,   /* starting offset of range     */
+       size_t                  isize,  /* length of range              */
+       page_buf_flags_t        flags)  /* PBF_TRYLOCK                  */
+{
+       page_buf_t              *pb, *new_pb;
+       int                     error;
+
+       new_pb = pagebuf_allocate(flags);
+       if (unlikely(!new_pb))
+               return (NULL);
+
+       /* _pagebuf_find either inserts and returns new_pb, or returns
+        * an existing matching buffer, making new_pb redundant.
+        */
+       pb = _pagebuf_find(target, ioff, isize, flags, new_pb);
+       if (pb != new_pb) {
+               pagebuf_deallocate(new_pb);
+               if (unlikely(!pb))
+                       return (NULL);
+       }
+
+       PB_STATS_INC(pb_get);
+
+       /* fill in any missing pages */
+       error = _pagebuf_lookup_pages(pb, pb->pb_target->pbr_mapping, flags);
+       if (unlikely(error)) {
+               pagebuf_free(pb);
+               return (NULL);
+       }
+
+       /*
+        * Always fill in the block number now, the mapped cases can do
+        * their own overlay of this later.
+        */
+       pb->pb_bn = ioff;
+       pb->pb_count_desired = pb->pb_buffer_length;
+
+       if (flags & PBF_READ) {
+               if (PBF_NOT_DONE(pb)) {
+                       PB_TRACE(pb, "get_read", (unsigned long)flags);
+                       PB_STATS_INC(pb_get_read);
+                       /* NOTE(review): iostart status is ignored here --
+                        * presumably errors surface via pb_error; confirm.
+                        */
+                       pagebuf_iostart(pb, flags);
+               } else if (flags & PBF_ASYNC) {
+                       PB_TRACE(pb, "get_read_async", (unsigned long)flags);
+                       /*
+                        * Read ahead call which is already satisfied,
+                        * drop the buffer
+                        */
+                       if (flags & (PBF_LOCK | PBF_TRYLOCK))
+                               pagebuf_unlock(pb);
+                       pagebuf_rele(pb);
+                       return NULL;
+               } else {
+                       PB_TRACE(pb, "get_read_done", (unsigned long)flags);
+                       /* We do not want read in the flags */
+                       pb->pb_flags &= ~PBF_READ;
+               }
+       } else {
+               PB_TRACE(pb, "get_write", (unsigned long)flags);
+       }
+       return (pb);
+}
+
+/*
+ * Create a skeletal pagebuf: initialised for the given target and
+ * range, but with no pages attached to it.  Returns NULL when the
+ * buffer object itself cannot be allocated.
+ */
+page_buf_t *
+pagebuf_lookup(
+       struct pb_target        *target,
+       loff_t                  ioff,
+       size_t                  isize,
+       page_buf_flags_t        flags)
+{
+       page_buf_t              *pb = pagebuf_allocate(flags);
+
+       if (!pb)
+               return NULL;
+       _pagebuf_initialize(pb, target, ioff, isize, flags);
+       return pb;
+}
+
+/*
+ * If we are not low on memory then do the readahead in a deadlock
+ * safe manner: bail out while the backing device's queues are
+ * congested, and issue the read as an async trylock so it can
+ * never sleep on a busy buffer.
+ */
+void
+pagebuf_readahead(
+       pb_target_t             *target,
+       loff_t                  ioff,
+       size_t                  isize,
+       page_buf_flags_t        flags)
+{
+       struct backing_dev_info *bdi = target->pbr_mapping->backing_dev_info;
+
+       /* Skip readahead entirely while the device is congested */
+       if (bdi_read_congested(bdi) || bdi_write_congested(bdi))
+               return;
+
+       flags |= (PBF_TRYLOCK|PBF_READ|PBF_ASYNC|PBF_MAPPABLE|PBF_READ_AHEAD);
+       pagebuf_get(target, ioff, isize, flags);
+}
+
+/*
+ * Allocate an empty lockable pagebuf of the given length with no
+ * pages or backing range attached.  Returns NULL on allocation
+ * failure.
+ */
+page_buf_t *
+pagebuf_get_empty(
+       size_t                  len,
+       pb_target_t             *target)
+{
+       page_buf_t              *pb = pagebuf_allocate(_PBF_LOCKABLE);
+
+       if (!pb)
+               return NULL;
+       _pagebuf_initialize(pb, target, 0, len, _PBF_LOCKABLE);
+       return pb;
+}
+
+/*
+ * Translate a kernel virtual address to its struct page, handling
+ * both directly-mapped and vmalloc'd addresses.
+ */
+static inline struct page *
+mem_to_page(
+       void                    *addr)
+{
+       unsigned long           va = (unsigned long)addr;
+
+       if (va >= VMALLOC_START && va < VMALLOC_END)
+               return vmalloc_to_page(addr);
+       return virt_to_page(addr);
+}
+
+/*
+ * pagebuf_associate_memory
+ *
+ * Attach a caller-supplied memory region of `len' bytes at `mem' to
+ * the pagebuf, replacing any existing page list.  The region may be
+ * directly mapped or vmalloc'd (see mem_to_page).  Returns 0 on
+ * success or the error from _pagebuf_get_pages.
+ */
+int
+pagebuf_associate_memory(
+       page_buf_t              *pb,
+       void                    *mem,
+       size_t                  len)
+{
+       int                     rval;
+       int                     i = 0;
+       size_t                  ptr;
+       size_t                  end, end_cur;
+       off_t                   offset;
+       int                     page_count;
+
+       /* Pages needed to cover len, plus one more when mem is not
+        * page aligned and the region spans an extra page.
+        */
+       page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
+       offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK);
+       if (offset && (len > PAGE_CACHE_SIZE))
+               page_count++;
+
+       /* Free any previous set of page pointers */
+       if (pb->pb_pages && (pb->pb_pages != pb->pb_page_array)) {
+               kfree(pb->pb_pages);
+       }
+       pb->pb_pages = NULL;
+       pb->pb_addr = mem;
+
+       rval = _pagebuf_get_pages(pb, page_count, 0);
+       if (rval)
+               return rval;
+
+       pb->pb_offset = offset;
+       ptr = (size_t) mem & PAGE_CACHE_MASK;
+       end = PAGE_CACHE_ALIGN((size_t) mem + len);
+       /* NOTE(review): end_cur is assigned but never used below */
+       end_cur = end;
+       /* set up first page */
+       pb->pb_pages[0] = mem_to_page(mem);
+
+       ptr += PAGE_CACHE_SIZE;
+       pb->pb_page_count = ++i;
+       while (ptr < end) {
+               pb->pb_pages[i] = mem_to_page((void *)ptr);
+               pb->pb_page_count = ++i;
+               ptr += PAGE_CACHE_SIZE;
+       }
+       pb->pb_locked = 0;
+
+       pb->pb_count_desired = pb->pb_buffer_length = len;
+       pb->pb_flags |= PBF_MAPPED;
+
+       return 0;
+}
+
+/*
+ * pagebuf_get_no_daddr
+ *
+ * Build an unhashed pagebuf of `len' bytes (capped at 0x20000) backed
+ * by kmalloc'd memory rather than page-cache pages, for I/O that has
+ * no disk address yet.  The backing allocation is retried with a
+ * doubled size until kmalloc returns a chunk aligned to the target's
+ * sector mask; only `len' bytes of it are associated with the buffer.
+ * The buffer is returned unlocked; NULL on any failure.
+ */
+page_buf_t *
+pagebuf_get_no_daddr(
+       size_t                  len,
+       pb_target_t             *target)
+{
+       int                     rval;
+       void                    *rmem = NULL;
+       page_buf_flags_t        flags = _PBF_LOCKABLE | PBF_FORCEIO;
+       page_buf_t              *pb;
+       size_t                  tlen = 0;
+
+       if (unlikely(len > 0x20000))
+               return NULL;
+
+       pb = pagebuf_allocate(flags);
+       if (!pb)
+               return NULL;
+
+       _pagebuf_initialize(pb, target, 0, len, flags);
+
+       do {
+               if (tlen == 0) {
+                       tlen = len; /* first time */
+               } else {
+                       kfree(rmem); /* free the mem from the previous try */
+                       tlen <<= 1; /* double the size and try again */
+               }
+               /* Test the pointer against NULL, not the literal 0 */
+               rmem = kmalloc(tlen, GFP_KERNEL);
+               if (rmem == NULL) {
+                       pagebuf_free(pb);
+                       return NULL;
+               }
+       } while ((size_t)rmem != ((size_t)rmem & ~target->pbr_smask));
+
+       if ((rval = pagebuf_associate_memory(pb, rmem, len)) != 0) {
+               kfree(rmem);
+               pagebuf_free(pb);
+               return NULL;
+       }
+       /* otherwise pagebuf_free just ignores it */
+       pb->pb_flags |= (_PBF_MEM_ALLOCATED | _PBF_MEM_SLAB);
+       PB_CLEAR_OWNER(pb);
+       up(&pb->pb_sema);       /* Return unlocked pagebuf */
+
+       PB_TRACE(pb, "no_daddr", rmem);
+
+       return pb;
+}
+
+
+/*
+ *     pagebuf_hold
+ *
+ *     Increment reference count on buffer, to hold the buffer concurrently
+ *     with another thread which may release (free) the buffer asynchronously.
+ *
+ *     Must hold the buffer already to call this function.
+ *     The reference is dropped with pagebuf_rele().
+ */
+void
+pagebuf_hold(
+       page_buf_t              *pb)
+{
+       atomic_inc(&pb->pb_hold);
+       PB_TRACE(pb, "hold", 0);
+}
+
+/*
+ *     pagebuf_free
+ *
+ *     pagebuf_free releases the specified buffer.  The modification
+ *     state of any associated pages is left unchanged.
+ *
+ *     NOTE(review): for lockable buffers the hash chain lock is taken
+ *     here but not released in this function -- presumably
+ *     _pagebuf_free_object() drops it after unhashing; confirm against
+ *     its definition.
+ */
+void
+pagebuf_free(
+       page_buf_t              *pb)
+{
+       if (pb->pb_flags & _PBF_LOCKABLE) {
+               pb_hash_t       *h = pb_hash(pb);
+
+               spin_lock(&h->pb_hash_lock);
+               _pagebuf_free_object(h, pb);
+       } else {
+               /* Unhashed buffer: no hash lock involved */
+               _pagebuf_free_object(NULL, pb);
+       }
+}
+
+/*
+ *     pagebuf_rele
+ *
+ *     pagebuf_rele releases a hold on the specified buffer.  If the
+ *     the hold count is 1, pagebuf_rele calls pagebuf_free.
+ */
+void
+pagebuf_rele(
+       page_buf_t              *pb)
+{
+       pb_hash_t               *h;
+
+       PB_TRACE(pb, "rele", pb->pb_relse);
+       /* Lockable buffers live on a hash chain; take its lock so the
+        * final-reference decision below is atomic w.r.t. lookups.
+        */
+       if (pb->pb_flags & _PBF_LOCKABLE) {
+               h = pb_hash(pb);
+               spin_lock(&h->pb_hash_lock);
+       } else {
+               h = NULL;
+       }
+
+       if (atomic_dec_and_test(&pb->pb_hold)) {
+               int             do_free = 1;
+
+               if (pb->pb_relse) {
+                       /* Hand off to the release callback; re-take a hold
+                        * so the buffer stays alive across the call.
+                        */
+                       atomic_inc(&pb->pb_hold);
+                       if (h)
+                               spin_unlock(&h->pb_hash_lock);
+                       (*(pb->pb_relse)) (pb);
+                       do_free = 0;
+               }
+               if (pb->pb_flags & PBF_DELWRI) {
+                       /* Delayed-write buffer: requeue instead of freeing */
+                       pb->pb_flags |= PBF_ASYNC;
+                       atomic_inc(&pb->pb_hold);
+                       if (h && do_free)
+                               spin_unlock(&h->pb_hash_lock);
+                       pagebuf_delwri_queue(pb, 0);
+                       do_free = 0;
+               } else if (pb->pb_flags & PBF_FS_MANAGED) {
+                       /* Filesystem-managed buffers are never freed here */
+                       if (h)
+                               spin_unlock(&h->pb_hash_lock);
+                       do_free = 0;
+               }
+
+               if (do_free) {
+                       /* NOTE(review): hash lock (if any) is still held and
+                        * presumably dropped inside _pagebuf_free_object().
+                        */
+                       _pagebuf_free_object(h, pb);
+               }
+       } else if (h) {
+               spin_unlock(&h->pb_hash_lock);
+       }
+}
+
+
+/*
+ *     Mutual exclusion on buffers.  Locking model:
+ *
+ *     Buffers associated with inodes for which buffer locking
+ *     is not enabled are not protected by semaphores, and are
+ *     assumed to be exclusively owned by the caller.  There is a
+ *     spinlock in the buffer, used by the caller when concurrent
+ *     access is possible.
+ */
+
+/*
+ *     pagebuf_cond_lock
+ *
+ *     Try to lock the buffer object without sleeping.  Returns 0 on
+ *     success and -EBUSY when somebody else holds the lock.  Only the
+ *     buffer object itself is locked, never the underlying pages, so
+ *     this serialises use of the pagebuf, not page access.
+ */
+int
+pagebuf_cond_lock(                     /* lock buffer, if not locked   */
+                                       /* returns -EBUSY if locked)    */
+       page_buf_t              *pb)
+{
+       ASSERT(pb->pb_flags & _PBF_LOCKABLE);
+
+       if (down_trylock(&pb->pb_sema)) {
+               /* Semaphore is held elsewhere */
+               PB_TRACE(pb, "cond_lock", 0L);
+               return -EBUSY;
+       }
+       PB_SET_OWNER(pb);
+       PB_TRACE(pb, "cond_lock", 1L);
+       return 0;
+}
+
+/*
+ *     pagebuf_lock_value
+ *
+ *     Return lock value for a pagebuf
+ *
+ *     NOTE(review): reaches into the semaphore's internal count field,
+ *     so it depends on this kernel's struct semaphore layout.
+ */
+int
+pagebuf_lock_value(
+       page_buf_t              *pb)
+{
+       ASSERT(pb->pb_flags & _PBF_LOCKABLE);
+       return(atomic_read(&pb->pb_sema.count));
+}
+
+/*
+ *     pagebuf_lock
+ *
+ *     pagebuf_lock locks a buffer object.  Note that this in no way
+ *     locks the underlying pages, so it is only useful for synchronizing
+ *     concurrent use of page buffer objects, not for synchronizing independent
+ *     access to the underlying pages.
+ *
+ *     Always returns 0; the return value exists for interface symmetry
+ *     with pagebuf_cond_lock().
+ */
+int
+pagebuf_lock(
+       page_buf_t              *pb)
+{
+       ASSERT(pb->pb_flags & _PBF_LOCKABLE);
+
+       PB_TRACE(pb, "lock", 0);
+       /* Push along any outstanding I/O before blocking on the lock */
+       if (atomic_read(&pb->pb_io_remaining))
+               blk_run_queues();
+       down(&pb->pb_sema);
+       PB_SET_OWNER(pb);
+       PB_TRACE(pb, "locked", 0);
+       return 0;
+}
+
+/*
+ *     pagebuf_unlock
+ *
+ *     pagebuf_unlock releases the lock on the buffer object created by
+ *     pagebuf_lock or pagebuf_cond_lock (not any
+ *     pinning of underlying pages created by pagebuf_pin).
+ */
+void
+pagebuf_unlock(                                /* unlock buffer                */
+       page_buf_t              *pb)    /* buffer to unlock             */
+{
+       ASSERT(pb->pb_flags & _PBF_LOCKABLE);
+       /* Clear the recorded owner before releasing the semaphore */
+       PB_CLEAR_OWNER(pb);
+       up(&pb->pb_sema);
+       PB_TRACE(pb, "unlock", 0);
+}
+
+
+/*
+ *     Pinning Buffer Storage in Memory
+ */
+
+/*
+ *     pagebuf_pin
+ *
+ *     pagebuf_pin locks all of the memory represented by a buffer in
+ *     memory.  Multiple calls to pagebuf_pin and pagebuf_unpin, for
+ *     the same or different buffers affecting a given page, will
+ *     properly count the number of outstanding "pin" requests.  The
+ *     buffer may be released after the pagebuf_pin and a different
+ *     buffer used when calling pagebuf_unpin, if desired.
+ *     pagebuf_pin should be used by the file system when it wants be
+ *     assured that no attempt will be made to force the affected
+ *     memory to disk.  It does not assure that a given logical page
+ *     will not be moved to a different physical page.
+ */
+void
+pagebuf_pin(
+       page_buf_t              *pb)
+{
+       atomic_inc(&pb->pb_pin_count);
+       /* Trace reads the raw atomic counter field (tracing only) */
+       PB_TRACE(pb, "pin", (long)pb->pb_pin_count.counter);
+}
+
+/*
+ *     pagebuf_unpin
+ *
+ *     pagebuf_unpin reverses the locking of memory performed by
+ *     pagebuf_pin.  Note that both functions affected the logical
+ *     pages associated with the buffer, not the buffer itself.
+ */
+void
+pagebuf_unpin(
+       page_buf_t              *pb)
+{
+       /* Wake waiters (see _pagebuf_wait_unpin) on the last unpin */
+       if (atomic_dec_and_test(&pb->pb_pin_count)) {
+               wake_up_all(&pb->pb_waiters);
+       }
+       PB_TRACE(pb, "unpin", (long)pb->pb_pin_count.counter);
+}
+
+/*
+ * Return the buffer's current pin count; non-zero means pinned.
+ */
+int
+pagebuf_ispin(
+       page_buf_t              *pb)
+{
+       return atomic_read(&pb->pb_pin_count);
+}
+
+/*
+ *     _pagebuf_wait_unpin
+ *
+ *     _pagebuf_wait_unpin waits until all of the memory associated
+ *     with the buffer is no longer locked in memory.  It returns
+ *     immediately if none of the affected pages are pinned.
+ */
+static inline void
+_pagebuf_wait_unpin(
+       page_buf_t              *pb)
+{
+       DECLARE_WAITQUEUE       (wait, current);
+
+       if (atomic_read(&pb->pb_pin_count) == 0)
+               return;
+
+       /* Open-coded sleep loop: queue ourselves first, then re-check
+        * the pin count after setting the task state so a wakeup from
+        * pagebuf_unpin() is not missed.
+        */
+       add_wait_queue(&pb->pb_waiters, &wait);
+       for (;;) {
+               current->state = TASK_UNINTERRUPTIBLE;
+               if (atomic_read(&pb->pb_pin_count) == 0)
+                       break;
+               /* Push along any outstanding I/O before sleeping */
+               if (atomic_read(&pb->pb_io_remaining))
+                       blk_run_queues();
+               schedule();
+       }
+       remove_wait_queue(&pb->pb_waiters, &wait);
+       current->state = TASK_RUNNING;
+}
+
+/*
+ *     Buffer Utility Routines
+ */
+
+/*
+ *     pagebuf_iodone
+ *
+ *     pagebuf_iodone marks a buffer for which I/O is in progress
+ *     done with respect to that I/O.  The pb_iodone routine, if
+ *     present, will be called as a side-effect.
+ */
+/*
+ * Completion worker, run either directly or from a workqueue (see
+ * pagebuf_iodone below).  Calls the buffer's pb_iodone callback when
+ * set; otherwise, for PBF_ASYNC buffers, unlocks the buffer (when it
+ * is lockable and has no release callback) and drops the hold.
+ */
+void
+pagebuf_iodone_work(
+       void                    *v)
+{
+       page_buf_t              *pb = (page_buf_t *)v;
+
+       if (pb->pb_iodone) {
+               (*(pb->pb_iodone)) (pb);
+               return;
+       }
+
+       if (pb->pb_flags & PBF_ASYNC) {
+               if ((pb->pb_flags & _PBF_LOCKABLE) && !pb->pb_relse)
+                       pagebuf_unlock(pb);
+               pagebuf_rele(pb);
+       }
+}
+
+/*
+ * pagebuf_iodone
+ *
+ * Mark the buffer's I/O complete: clear the READ/WRITE flags and, on
+ * success, the PARTIAL/NONE state bits.  When a completion callback
+ * is set or the buffer is async, run pagebuf_iodone_work -- either
+ * inline or queued on the data/log workqueue selected by `dataio'
+ * when `schedule' is set.  Otherwise wake the synchronous waiter
+ * blocked on pb_iodonesema (presumably pagebuf_iowait -- not visible
+ * here; confirm).
+ */
+void
+pagebuf_iodone(
+       page_buf_t              *pb,
+       int                     dataio,
+       int                     schedule)
+{
+       pb->pb_flags &= ~(PBF_READ | PBF_WRITE);
+       if (pb->pb_error == 0) {
+               pb->pb_flags &= ~(PBF_PARTIAL | PBF_NONE);
+       }
+
+       PB_TRACE(pb, "iodone", pb->pb_iodone);
+
+       if ((pb->pb_iodone) || (pb->pb_flags & PBF_ASYNC)) {
+               if (schedule) {
+                       INIT_WORK(&pb->pb_iodone_work, pagebuf_iodone_work, pb);
+                       queue_work(dataio ? pagebuf_dataio_workqueue :
+                               pagebuf_logio_workqueue, &pb->pb_iodone_work);
+               } else {
+                       pagebuf_iodone_work(pb);
+               }
+       } else {
+               up(&pb->pb_iodonesema);
+       }
+}
+
+/*
+ *     pagebuf_ioerror
+ *
+ *     pagebuf_ioerror sets the error code for a buffer.
+ *     Passing 0 clears a previously recorded error.
+ */
+void
+pagebuf_ioerror(                       /* mark/clear buffer error flag */
+       page_buf_t              *pb,    /* buffer to mark               */
+       unsigned int            error)  /* error to store (0 if none)   */
+{
+       pb->pb_error = error;
+       PB_TRACE(pb, "ioerror", (unsigned long)error);
+}
+
+/*
+ *     pagebuf_iostart
+ *
+ *     pagebuf_iostart initiates I/O on a buffer, based on the flags supplied.
+ *     If necessary, it will arrange for any disk space allocation required,
+ *     and it will break up the request if the block mappings require it.
+ *     The pb_iodone routine in the buffer supplied will only be called
+ *     when all of the subsidiary I/O requests, if any, have been completed.
+ *     pagebuf_iostart calls the pagebuf_ioinitiate routine or
+ *     pagebuf_iorequest, if the former routine is not defined, to start
+ *     the I/O on a given low-level request.
+ */
+int
+pagebuf_iostart(                       /* start I/O on a buffer          */
+       page_buf_t              *pb,    /* buffer to start                */
+       page_buf_flags_t        flags)  /* PBF_LOCK, PBF_ASYNC, PBF_READ, */
+                                       /* PBF_WRITE, PBF_DELWRI,         */
+                                       /* PBF_SYNC, PBF_DONT_BLOCK       */
+{
+       int                     status = 0;
+
+       PB_TRACE(pb, "iostart", (unsigned long)flags);
+
+       /* Delayed writes are only queued here; no I/O is issued yet */
+       if (flags & PBF_DELWRI) {
+               pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC);
+               pb->pb_flags |= flags &
+                               (PBF_DELWRI | PBF_ASYNC | PBF_SYNC);
+               pagebuf_delwri_queue(pb, 1);
+               return status;
+       }
+
+       /* Carry the caller's I/O-mode bits over onto the buffer */
+       pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC | \
+                       PBF_DELWRI | PBF_READ_AHEAD | PBF_RUN_QUEUES);
+       pb->pb_flags |= flags & (PBF_READ | PBF_WRITE | PBF_ASYNC | \
+                       PBF_SYNC | PBF_READ_AHEAD | PBF_RUN_QUEUES);
+
+       BUG_ON(pb->pb_bn == PAGE_BUF_DADDR_NULL);
+
+       /* For writes allow an alternate strategy routine to precede
+        * the actual I/O request (which may not be issued at all in
+        * a shutdown situation, for example).
+        */
+       status = (flags & PBF_WRITE) ?
+               pagebuf_iostrategy(pb) : pagebuf_iorequest(pb);
+
+       /* Wait for I/O if we are not an async request.
+        * Note: async I/O request completion will release the buffer,
+        * and that can already be done by this point.  So using the
+        * buffer pointer from here on, after async I/O, is invalid.
+        */
+       if (!status && !(flags & PBF_ASYNC))
+               status = pagebuf_iowait(pb);
+
+       return status;
+}
+
+/*
+ * Helper routine for pagebuf_iorequest
+ */
+
+/*
+ * Tell whether the pages backing this buffer are held locked across
+ * the I/O: reads rely on the pb_locked flag, writes lock pages only
+ * when the buffer itself is not lockable.
+ */
+STATIC __inline__ int
+_pagebuf_iolocked(
+       page_buf_t              *pb)
+{
+       ASSERT(pb->pb_flags & (PBF_READ|PBF_WRITE));
+       return (pb->pb_flags & PBF_READ) ?
+               pb->pb_locked : !(pb->pb_flags & _PBF_LOCKABLE);
+}
+
+/*
+ * Per-request completion bookkeeping: when the last outstanding I/O
+ * for the buffer finishes, release the page-lock state and signal
+ * overall completion via pagebuf_iodone().
+ */
+STATIC __inline__ void
+_pagebuf_iodone(
+       page_buf_t              *pb,
+       int                     schedule)
+{
+       /* atomic_dec_and_test() only guarantees a true/false result;
+        * test it as a boolean instead of comparing against literal 1.
+        */
+       if (atomic_dec_and_test(&pb->pb_io_remaining)) {
+               pb->pb_locked = 0;
+               pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), schedule);
+       }
+}
+
+/*
+ * bio completion handler for pagebuf I/O.  Marks per-page (or, for
+ * sub-page block sizes, per-sector) uptodate state, unlocks pages
+ * that were held locked for the I/O, and signals buffer completion.
+ */
+STATIC int
+bio_end_io_pagebuf(
+       struct bio              *bio,
+       unsigned int            bytes_done,
+       int                     error)
+{
+       page_buf_t              *pb = (page_buf_t *)bio->bi_private;
+       unsigned int            i, blocksize = pb->pb_target->pbr_bsize;
+       unsigned int            sectorshift = pb->pb_target->pbr_sshift;
+       struct bio_vec          *bvec = bio->bi_io_vec;
+
+       /* Partial completion: more of this bio is still in flight */
+       if (bio->bi_size)
+               return 1;
+
+       if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+               pb->pb_error = EIO;
+
+       for (i = 0; i < bio->bi_vcnt; i++, bvec++) {
+               struct page     *page = bvec->bv_page;
+
+               if (pb->pb_error) {
+                       SetPageError(page);
+               } else if (blocksize == PAGE_CACHE_SIZE) {
+                       SetPageUptodate(page);
+               } else if (!PagePrivate(page)) {
+                       unsigned int    j, range;
+
+                       /* page->private is a bitmap of uptodate sectors;
+                        * set the bits covered by this bio_vec.
+                        */
+                       ASSERT(blocksize < PAGE_CACHE_SIZE);
+                       range = (bvec->bv_offset + bvec->bv_len) >> sectorshift;
+                       for (j = bvec->bv_offset >> sectorshift; j < range; j++)
+                               set_bit(j, &page->private);
+                       /* NOTE(review): PAGE_CACHE_SIZE-1 as the "all sectors
+                        * valid" bitmap value looks suspicious when the page
+                        * holds other than PAGE_CACHE_SIZE sectors; confirm
+                        * the intended full-bitmap mask.
+                        */
+                       if (page->private == (unsigned long)(PAGE_CACHE_SIZE-1))
+                               SetPageUptodate(page);
+               }
+
+               if (_pagebuf_iolocked(pb)) {
+                       unlock_page(page);
+               }
+       }
+
+       _pagebuf_iodone(pb, 1);
+       bio_put(bio);
+       return 0;
+}
+
+void
+_pagebuf_ioapply(
+       page_buf_t              *pb)
+{
+       int                     i, map_i, total_nr_pages, nr_pages;
+       struct bio              *bio;
+       int                     offset = pb->pb_offset;
+       int                     size = pb->pb_count_desired;
+       sector_t                sector = pb->pb_bn;
+       unsigned int            blocksize = pb->pb_target->pbr_bsize;
+       int                     locking = _pagebuf_iolocked(pb);
+
+       total_nr_pages = pb->pb_page_count;
+       map_i = 0;
+
+       /* Special code path for reading a sub page size pagebuf in --
+        * we populate up the whole page, and hence the other metadata
+        * in the same page.  This optimization is only valid when the
+        * filesystem block size and the page size are equal.
+        */
+       if ((pb->pb_buffer_length < PAGE_CACHE_SIZE) &&
+           (pb->pb_flags & PBF_READ) && locking &&
+           (blocksize == PAGE_CACHE_SIZE)) {
+               bio = bio_alloc(GFP_NOIO, 1);
+
+               bio->bi_bdev = pb->pb_target->pbr_bdev;
+               bio->bi_sector = sector - (offset >> BBSHIFT);
+               bio->bi_end_io = bio_end_io_pagebuf;
+               bio->bi_private = pb;
+
+               bio_add_page(bio, pb->pb_pages[0], PAGE_CACHE_SIZE, 0);
+               size = 0;
+
+               atomic_inc(&pb->pb_io_remaining);
+
+               goto submit_io;
+       }
+
+       /* Lock down the pages which we need to for the request */
+       if (locking && (pb->pb_flags & PBF_WRITE) && (pb->pb_locked == 0)) {
+               for (i = 0; size; i++) {
+                       int             nbytes = PAGE_CACHE_SIZE - offset;
+                       struct page     *page = pb->pb_pages[i];
+
+                       if (nbytes > size)
+                               nbytes = size;
+
+                       lock_page(page);
+
+                       size -= nbytes;
+                       offset = 0;
+               }
+               offset = pb->pb_offset;
+               size = pb->pb_count_desired;
+       }
+
+next_chunk:
+       atomic_inc(&pb->pb_io_remaining);
+       nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
+       if (nr_pages > total_nr_pages)
+               nr_pages = total_nr_pages;
+
+       bio = bio_alloc(GFP_NOIO, nr_pages);
+       bio->bi_bdev = pb->pb_target->pbr_bdev;
+       bio->bi_sector = sector;
+       bio->bi_end_io = bio_end_io_pagebuf;
+       bio->bi_private = pb;
+
+       for (; size && nr_pages; nr_pages--, map_i++) {
+               int     nbytes = PAGE_CACHE_SIZE - offset;
+
+               if (nbytes > size)
+                       nbytes = size;
+
+               if (bio_add_page(bio, pb->pb_pages[map_i],
+                                       nbytes, offset) < nbytes)
+                       break;
+
+               offset = 0;
+               sector += nbytes >> BBSHIFT;
+               size -= nbytes;
+               total_nr_pages--;
+       }
+
+submit_io:
+       if (likely(bio->bi_size)) {
+               submit_bio((pb->pb_flags & PBF_READ) ? READ : WRITE, bio);
+               if (size)
+                       goto next_chunk;
+       } else {
+               bio_put(bio);
+               pagebuf_ioerror(pb, EIO);
+       }
+
+       if (pb->pb_flags & PBF_RUN_QUEUES) {
+               pb->pb_flags &= ~PBF_RUN_QUEUES;
+               if (atomic_read(&pb->pb_io_remaining) > 1)
+                       blk_run_queues();
+       }
+}
+
+/*
+ *     pagebuf_iorequest
+ *
+ *     pagebuf_iorequest is the core I/O request routine.
+ *     It assumes that the buffer is well-formed and
+ *     mapped and ready for physical I/O, unlike
+ *     pagebuf_iostart() and pagebuf_iophysio().  Those
+ *     routines call the pagebuf_ioinitiate routine to start I/O,
+ *     if it is present, or else call pagebuf_iorequest()
+ *     directly if the pagebuf_ioinitiate routine is not present.
+ *
+ *     This function will be responsible for ensuring access to the
+ *     pages is restricted whilst I/O is in progress - for locking
+ *     pagebufs the pagebuf lock is the mediator, for non-locking
+ *     pagebufs the pages will be locked. In the locking case we
+ *     need to use the pagebuf lock as multiple meta-data buffers
+ *     will reference the same page.
+ */
+int
+pagebuf_iorequest(                     /* start real I/O               */
+       page_buf_t              *pb)    /* buffer to convey to device   */
+{
+       PB_TRACE(pb, "iorequest", 0);
+
+       if (pb->pb_flags & PBF_DELWRI) {
+               pagebuf_delwri_queue(pb, 1);
+               return 0;
+       }
+
+       if (pb->pb_flags & PBF_WRITE) {
+               _pagebuf_wait_unpin(pb);
+       }
+
+       pagebuf_hold(pb);
+
+       /* Set the count to 1 initially, this will stop an I/O
+        * completion callout which happens before we have started
+        * all the I/O from calling pagebuf_iodone too early.
+        */
+       atomic_set(&pb->pb_io_remaining, 1);
+       _pagebuf_ioapply(pb);
+       _pagebuf_iodone(pb, 0);
+
+       pagebuf_rele(pb);
+       return 0;
+}
+
+/*
+ *     pagebuf_iowait
+ *
+ *     pagebuf_iowait waits for I/O to complete on the buffer supplied.
+ *     It returns immediately if no I/O is pending.  In any case, it returns
+ *     the error code, if any, or 0 if there is no error.
+ */
+int
+pagebuf_iowait(
+       page_buf_t              *pb)
+{
+       PB_TRACE(pb, "iowait", 0);
+       if (atomic_read(&pb->pb_io_remaining))
+               blk_run_queues();
+       down(&pb->pb_iodonesema);
+       PB_TRACE(pb, "iowaited", (long)pb->pb_error);
+       return pb->pb_error;
+}
+
+STATIC void *
+pagebuf_mapout_locked(
+       page_buf_t              *pb)
+{
+       void                    *old_addr = NULL;
+
+       if (pb->pb_flags & PBF_MAPPED) {
+               if (pb->pb_flags & _PBF_ADDR_ALLOCATED)
+                       old_addr = pb->pb_addr - pb->pb_offset;
+               pb->pb_addr = NULL;
+               pb->pb_flags &= ~(PBF_MAPPED | _PBF_ADDR_ALLOCATED);
+       }
+
+       return old_addr;        /* Caller must free the address space,
+                                * we are under a spin lock, probably
+                                * not safe to do vfree here
+                                */
+}
+
+caddr_t
+pagebuf_offset(
+       page_buf_t              *pb,
+       size_t                  offset)
+{
+       struct page             *page;
+
+       offset += pb->pb_offset;
+
+       page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT];
+       return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1));
+}
+
+/*
+ *     pagebuf_iomove
+ *
+ *     Move data into or out of a buffer.
+ */
+void
+pagebuf_iomove(
+       page_buf_t              *pb,    /* buffer to process            */
+       size_t                  boff,   /* starting buffer offset       */
+       size_t                  bsize,  /* length to copy               */
+       caddr_t                 data,   /* data address                 */
+       page_buf_rw_t           mode)   /* read/write flag              */
+{
+       size_t                  bend, cpoff, csize;
+       struct page             *page;
+
+       bend = boff + bsize;
+       while (boff < bend) {
+               page = pb->pb_pages[page_buf_btoct(boff + pb->pb_offset)];
+               cpoff = page_buf_poff(boff + pb->pb_offset);
+               csize = min_t(size_t,
+                             PAGE_CACHE_SIZE-cpoff, pb->pb_count_desired-boff);
+
+               ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE));
+
+               switch (mode) {
+               case PBRW_ZERO:
+                       memset(page_address(page) + cpoff, 0, csize);
+                       break;
+               case PBRW_READ:
+                       memcpy(data, page_address(page) + cpoff, csize);
+                       break;
+               case PBRW_WRITE:
+                       memcpy(page_address(page) + cpoff, data, csize);
+               }
+
+               boff += csize;
+               data += csize;
+       }
+}
+
+
+/*
+ * Pagebuf delayed write buffer handling
+ */
+
+STATIC int pbd_active = 1;
+STATIC LIST_HEAD(pbd_delwrite_queue);
+STATIC spinlock_t pbd_delwrite_lock = SPIN_LOCK_UNLOCKED;
+
+STATIC void
+pagebuf_delwri_queue(
+       page_buf_t              *pb,
+       int                     unlock)
+{
+       PB_TRACE(pb, "delwri_q", (long)unlock);
+       spin_lock(&pbd_delwrite_lock);
+       /* If already in the queue, dequeue and place at tail */
+       if (!list_empty(&pb->pb_list)) {
+               if (unlock) {
+                       atomic_dec(&pb->pb_hold);
+               }
+               list_del(&pb->pb_list);
+       }
+
+       list_add_tail(&pb->pb_list, &pbd_delwrite_queue);
+       pb->pb_flushtime = jiffies + pb_params.age_buffer.val;
+       spin_unlock(&pbd_delwrite_lock);
+
+       if (unlock && (pb->pb_flags & _PBF_LOCKABLE)) {
+               pagebuf_unlock(pb);
+       }
+}
+
+void
+pagebuf_delwri_dequeue(
+       page_buf_t              *pb)
+{
+       PB_TRACE(pb, "delwri_uq", 0);
+       spin_lock(&pbd_delwrite_lock);
+       list_del_init(&pb->pb_list);
+       pb->pb_flags &= ~PBF_DELWRI;
+       spin_unlock(&pbd_delwrite_lock);
+}
+
+STATIC void
+pagebuf_runall_queues(
+       struct workqueue_struct *queue)
+{
+       flush_workqueue(queue);
+}
+
+/* Defines for pagebuf daemon */
+DECLARE_WAIT_QUEUE_HEAD(pbd_waitq);
+STATIC int force_flush;
+
+STATIC void
+pagebuf_daemon_wakeup(
+       int                     flag)
+{
+       force_flush = flag;
+       if (waitqueue_active(&pbd_waitq)) {
+               wake_up_interruptible(&pbd_waitq);
+       }
+}
+
+typedef void (*timeout_fn)(unsigned long);
+
+STATIC int
+pagebuf_daemon(
+       void                    *data)
+{
+       int                     count;
+       page_buf_t              *pb;
+       struct list_head        *curr, *next, tmp;
+       struct timer_list       pb_daemon_timer =
+               TIMER_INITIALIZER((timeout_fn)pagebuf_daemon_wakeup, 0, 0);
+
+       /*  Set up the thread  */
+       daemonize("pagebufd");
+
+       current->flags |= PF_MEMALLOC;
+
+       INIT_LIST_HEAD(&tmp);
+       do {
+               /* swsusp */
+               if (current->flags & PF_FREEZE)
+                       refrigerator(PF_IOTHREAD);
+
+               if (pbd_active == 1) {
+                       mod_timer(&pb_daemon_timer,
+                                 jiffies + pb_params.flush_interval.val);
+                       interruptible_sleep_on(&pbd_waitq);
+               }
+
+               if (pbd_active == 0) {
+                       del_timer_sync(&pb_daemon_timer);
+               }
+
+               spin_lock(&pbd_delwrite_lock);
+
+               count = 0;
+               list_for_each_safe(curr, next, &pbd_delwrite_queue) {
+                       pb = list_entry(curr, page_buf_t, pb_list);
+
+                       PB_TRACE(pb, "walkq1", (long)pagebuf_ispin(pb));
+
+                       if ((pb->pb_flags & PBF_DELWRI) && !pagebuf_ispin(pb) &&
+                           (((pb->pb_flags & _PBF_LOCKABLE) == 0) ||
+                            !pagebuf_cond_lock(pb))) {
+
+                               if (!force_flush &&
+                                   time_before(jiffies, pb->pb_flushtime)) {
+                                       pagebuf_unlock(pb);
+                                       break;
+                               }
+
+                               pb->pb_flags &= ~PBF_DELWRI;
+                               pb->pb_flags |= PBF_WRITE;
+
+                               list_del(&pb->pb_list);
+                               list_add(&pb->pb_list, &tmp);
+
+                               count++;
+                       }
+               }
+
+               spin_unlock(&pbd_delwrite_lock);
+               while (!list_empty(&tmp)) {
+                       pb = list_entry(tmp.next, page_buf_t, pb_list);
+                       list_del_init(&pb->pb_list);
+
+                       pagebuf_iostrategy(pb);
+               }
+
+               if (as_list_len > 0)
+                       purge_addresses();
+               if (count)
+                       blk_run_queues();
+
+               force_flush = 0;
+       } while (pbd_active == 1);
+
+       pbd_active = -1;
+       wake_up_interruptible(&pbd_waitq);
+
+       return 0;
+}
+
+void
+pagebuf_delwri_flush(
+       pb_target_t             *target,
+       u_long                  flags,
+       int                     *pinptr)
+{
+       page_buf_t              *pb;
+       struct list_head        *curr, *next, tmp;
+       int                     pincount = 0;
+       int                     flush_cnt = 0;
+
+       pagebuf_runall_queues(pagebuf_dataio_workqueue);
+       pagebuf_runall_queues(pagebuf_logio_workqueue);
+
+       spin_lock(&pbd_delwrite_lock);
+       INIT_LIST_HEAD(&tmp);
+
+       list_for_each_safe(curr, next, &pbd_delwrite_queue) {
+               pb = list_entry(curr, page_buf_t, pb_list);
+
+               /*
+                * Skip other targets, markers and in progress buffers
+                */
+
+               if ((pb->pb_flags == 0) || (pb->pb_target != target) ||
+                   !(pb->pb_flags & PBF_DELWRI)) {
+                       continue;
+               }
+
+               PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb));
+               if (pagebuf_ispin(pb)) {
+                       pincount++;
+                       continue;
+               }
+
+               pb->pb_flags &= ~PBF_DELWRI;
+               pb->pb_flags |= PBF_WRITE;
+               list_move(&pb->pb_list, &tmp);
+       }
+       /* ok found all the items that can be worked on 
+        * drop the lock and process the private list */
+       spin_unlock(&pbd_delwrite_lock);
+
+       list_for_each_safe(curr, next, &tmp) {
+               pb = list_entry(curr, page_buf_t, pb_list);
+
+               if (flags & PBDF_WAIT)
+                       pb->pb_flags &= ~PBF_ASYNC;
+               else
+                       list_del_init(curr);
+
+               pagebuf_lock(pb);
+               pagebuf_iostrategy(pb);
+               if (++flush_cnt > 32) {
+                       blk_run_queues();
+                       flush_cnt = 0;
+               }
+       }
+
+       blk_run_queues();
+
+       while (!list_empty(&tmp)) {
+               pb = list_entry(tmp.next, page_buf_t, pb_list);
+
+               list_del_init(&pb->pb_list);
+               pagebuf_iowait(pb);
+               if (!pb->pb_relse)
+                       pagebuf_unlock(pb);
+               pagebuf_rele(pb);
+       }
+
+       if (pinptr)
+               *pinptr = pincount;
+}
+
+STATIC int
+pagebuf_daemon_start(void)
+{
+       int             rval;
+
+       pagebuf_logio_workqueue = create_workqueue("xfslogd");
+       if (!pagebuf_logio_workqueue)
+               return -ENOMEM;
+
+       pagebuf_dataio_workqueue = create_workqueue("xfsdatad");
+       if (!pagebuf_dataio_workqueue) {
+               destroy_workqueue(pagebuf_logio_workqueue);
+               return -ENOMEM;
+       }
+
+       rval = kernel_thread(pagebuf_daemon, NULL, CLONE_FS|CLONE_FILES);
+       if (rval < 0) {
+               destroy_workqueue(pagebuf_logio_workqueue);
+               destroy_workqueue(pagebuf_dataio_workqueue);
+       }
+
+       return rval;
+}
+
+/*
+ * pagebuf_daemon_stop
+ *
+ * Note: do not mark as __exit, it is called from pagebuf_terminate.
+ */
+STATIC void
+pagebuf_daemon_stop(void)
+{
+       pbd_active = 0;
+       wake_up_interruptible(&pbd_waitq);
+       wait_event_interruptible(pbd_waitq, pbd_active);
+       destroy_workqueue(pagebuf_logio_workqueue);
+       destroy_workqueue(pagebuf_dataio_workqueue);
+}
+
+
+/*
+ * Pagebuf sysctl interface
+ */
+
+STATIC int
+pb_stats_clear_handler(
+       ctl_table               *ctl,
+       int                     write,
+       struct file             *filp,
+       void                    *buffer,
+       size_t                  *lenp)
+{
+       int                     c, ret;
+       int                     *valp = ctl->data;
+
+       ret = proc_dointvec_minmax(ctl, write, filp, buffer, lenp);
+
+       if (!ret && write && *valp) {
+               printk("XFS Clearing pbstats\n");
+               for (c = 0; c < NR_CPUS; c++) {
+                       if (!cpu_possible(c)) continue;
+                               memset(&per_cpu(pbstats, c), 0,
+                                      sizeof(struct pbstats));
+               }
+               pb_params.stats_clear.val = 0;
+       }
+
+       return ret;
+}
+
+STATIC struct ctl_table_header *pagebuf_table_header;
+
+STATIC ctl_table pagebuf_table[] = {
+       {PB_FLUSH_INT, "flush_int", &pb_params.flush_interval.val,
+       sizeof(int), 0644, NULL, &proc_dointvec_minmax,
+       &sysctl_intvec, NULL,
+       &pb_params.flush_interval.min, &pb_params.flush_interval.max},
+
+       {PB_FLUSH_AGE, "flush_age", &pb_params.age_buffer.val,
+       sizeof(int), 0644, NULL, &proc_dointvec_minmax,
+       &sysctl_intvec, NULL, 
+       &pb_params.age_buffer.min, &pb_params.age_buffer.max},
+
+       {PB_STATS_CLEAR, "stats_clear", &pb_params.stats_clear.val,
+       sizeof(int), 0644, NULL, &pb_stats_clear_handler,
+       &sysctl_intvec, NULL, 
+       &pb_params.stats_clear.min, &pb_params.stats_clear.max},
+
+#ifdef PAGEBUF_TRACE
+       {PB_DEBUG, "debug", &pb_params.debug.val,
+       sizeof(int), 0644, NULL, &proc_dointvec_minmax,
+       &sysctl_intvec, NULL, 
+       &pb_params.debug.min, &pb_params.debug.max},
+#endif
+       {0}
+};
+
+STATIC ctl_table pagebuf_dir_table[] = {
+       {VM_PAGEBUF, "pagebuf", NULL, 0, 0555, pagebuf_table},
+       {0}
+};
+
+STATIC ctl_table pagebuf_root_table[] = {
+       {CTL_VM, "vm",  NULL, 0, 0555, pagebuf_dir_table},
+       {0}
+};
+
+#ifdef CONFIG_PROC_FS
+STATIC int
+pagebuf_readstats(
+       char                    *buffer,
+       char                    **start,
+       off_t                   offset,
+       int                     count,
+       int                     *eof,
+       void                    *data)
+{
+       int                     c, i, len, val;
+
+       len = 0;
+       len += sprintf(buffer + len, "pagebuf");
+       for (i = 0; i < sizeof(struct pbstats) / sizeof(u_int32_t); i++) {
+               val = 0;
+               for (c = 0 ; c < NR_CPUS; c++) {
+                       if (!cpu_possible(c)) continue;
+                       val += *(((u_int32_t*)&per_cpu(pbstats, c) + i));
+               }
+               len += sprintf(buffer + len, " %u", val);
+       }
+       buffer[len++] = '\n';
+
+       if (offset >= len) {
+               *start = buffer;
+               *eof = 1;
+               return 0;
+       }
+       *start = buffer + offset;
+       if ((len -= offset) > count)
+               return count;
+       *eof = 1;
+
+       return len;
+}
+#endif  /* CONFIG_PROC_FS */
+
+/*
+ *     Initialization and Termination
+ */
+
+int __init
+pagebuf_init(void)
+{
+       int                     i;
+
+       pagebuf_table_header = register_sysctl_table(pagebuf_root_table, 1);
+
+#ifdef CONFIG_PROC_FS
+       if (proc_mkdir("fs/pagebuf", 0))
+               create_proc_read_entry(
+                       "fs/pagebuf/stat", 0, 0, pagebuf_readstats, NULL);
+#endif
+
+       pagebuf_cache = kmem_cache_create("page_buf_t", sizeof(page_buf_t), 0,
+                       SLAB_HWCACHE_ALIGN, NULL, NULL);
+       if (pagebuf_cache == NULL) {
+               printk("pagebuf: couldn't init pagebuf cache\n");
+               pagebuf_terminate();
+               return -ENOMEM;
+       }
+
+       for (i = 0; i < NHASH; i++) {
+               spin_lock_init(&pbhash[i].pb_hash_lock);
+               INIT_LIST_HEAD(&pbhash[i].pb_hash);
+       }
+
+#ifdef PAGEBUF_TRACE
+       pagebuf_trace_buf = ktrace_alloc(PAGEBUF_TRACE_SIZE, KM_SLEEP);
+#endif
+
+       pagebuf_daemon_start();
+       return 0;
+}
+
+
+/*
+ *     pagebuf_terminate.
+ *
+ *     Note: do not mark as __exit, this is also called from the __init code.
+ */
+void
+pagebuf_terminate(void)
+{
+       pagebuf_daemon_stop();
+
+       kmem_cache_destroy(pagebuf_cache);
+
+       unregister_sysctl_table(pagebuf_table_header);
+#ifdef  CONFIG_PROC_FS
+       remove_proc_entry("fs/pagebuf/stat", NULL);
+       remove_proc_entry("fs/pagebuf", NULL);
+#endif
+}
+
+
+/*
+ *     Module management (for kernel debugger module)
+ */
+EXPORT_SYMBOL(pagebuf_offset);
+#ifdef DEBUG
+EXPORT_SYMBOL(pbd_delwrite_queue);
+#endif
diff --git a/fs/xfs/linux/xfs_buf.h b/fs/xfs/linux/xfs_buf.h
new file mode 100644 (file)
index 0000000..809a845
--- /dev/null
@@ -0,0 +1,618 @@
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.  Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * Written by Steve Lord, Jim Mostek, Russell Cattelan at SGI
+ */
+
+#ifndef __XFS_BUF_H__
+#define __XFS_BUF_H__
+
+#include <linux/config.h>
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <asm/system.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/uio.h>
+
+/*
+ *     Base types
+ */
+
+/* daddr must be signed since -1 is used for bmaps that are not yet allocated */
+typedef loff_t page_buf_daddr_t;
+
+#define PAGE_BUF_DADDR_NULL ((page_buf_daddr_t) (-1LL))
+
+#define page_buf_ctob(pp)      ((pp) * PAGE_CACHE_SIZE)
+#define page_buf_btoc(dd)      (((dd) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT)
+#define page_buf_btoct(dd)     ((dd) >> PAGE_CACHE_SHIFT)
+#define page_buf_poff(aa)      ((aa) & ~PAGE_CACHE_MASK)
+
+typedef enum page_buf_rw_e {
+       PBRW_READ = 1,                  /* transfer into target memory */
+       PBRW_WRITE = 2,                 /* transfer from target memory */
+       PBRW_ZERO = 3                   /* Zero target memory */
+} page_buf_rw_t;
+
+
+typedef enum page_buf_flags_e {                /* pb_flags values */
+       PBF_READ = (1 << 0),    /* buffer intended for reading from device */
+       PBF_WRITE = (1 << 1),   /* buffer intended for writing to device   */
+       PBF_MAPPED = (1 << 2),  /* buffer mapped (pb_addr valid)           */
+       PBF_PARTIAL = (1 << 3), /* buffer partially read                   */
+       PBF_ASYNC = (1 << 4),   /* initiator will not wait for completion  */
+       PBF_NONE = (1 << 5),    /* buffer not read at all                  */
+       PBF_DELWRI = (1 << 6),  /* buffer has dirty pages                  */
+       PBF_FREED = (1 << 7),   /* buffer has been freed and is invalid    */
+       PBF_SYNC = (1 << 8),    /* force updates to disk                   */
+       PBF_MAPPABLE = (1 << 9),/* use directly-addressable pages          */
+       PBF_STALE = (1 << 10),  /* buffer has been staled, do not find it  */
+       PBF_FS_MANAGED = (1 << 11), /* filesystem controls freeing memory  */
+       PBF_FS_DATAIOD = (1 << 12), /* schedule IO completion on fs datad  */
+
+       /* flags used only as arguments to access routines */
+       PBF_LOCK = (1 << 13),   /* lock requested                          */
+       PBF_TRYLOCK = (1 << 14), /* lock requested, but do not wait        */
+       PBF_DONT_BLOCK = (1 << 15), /* do not block in current thread      */
+
+       /* flags used only internally */
+       _PBF_LOCKABLE = (1 << 16), /* page_buf_t may be locked             */
+       _PBF_PRIVATE_BH = (1 << 17), /* do not use public buffer heads     */
+       _PBF_ALL_PAGES_MAPPED = (1 << 18), /* all pages in range mapped    */
+       _PBF_ADDR_ALLOCATED = (1 << 19), /* pb_addr space was allocated    */
+       _PBF_MEM_ALLOCATED = (1 << 20), /* underlying pages are allocated  */
+       _PBF_MEM_SLAB = (1 << 21), /* underlying pages are slab allocated  */
+
+       PBF_FORCEIO = (1 << 22), /* ignore any cache state                 */
+       PBF_FLUSH = (1 << 23),  /* flush disk write cache                  */
+       PBF_READ_AHEAD = (1 << 24), /* asynchronous read-ahead             */
+       PBF_RUN_QUEUES = (1 << 25), /* run block device task queue         */
+
+} page_buf_flags_t;
+
+#define PBF_UPDATE (PBF_READ | PBF_WRITE)
+#define PBF_NOT_DONE(pb) (((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) != 0)
+#define PBF_DONE(pb) (((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) == 0)
+
+typedef struct pb_target {
+       dev_t                   pbr_dev;
+       struct block_device     *pbr_bdev;
+       struct address_space    *pbr_mapping;
+       unsigned int            pbr_bsize;
+       unsigned int            pbr_sshift;
+       size_t                  pbr_smask;
+} pb_target_t;
+
+/*
+ *     page_buf_t:  Buffer structure for page cache-based buffers
+ *
+ * This buffer structure is used by the page cache buffer management routines
+ * to refer to an assembly of pages forming a logical buffer.  The actual
+ * I/O is performed with buffer_head or bio structures, as required by drivers,
+ * for drivers which do not understand this structure.  The buffer structure is
+ * used on temporary basis only, and discarded when released.
+ *
+ * The real data storage is recorded in the page cache.  Metadata is
+ * hashed to the inode for the block device on which the file system resides.
+ * File data is hashed to the inode for the file.  Pages which are only
+ * partially filled with data have bits set in their block_map entry
+ * to indicate which disk blocks in the page are not valid.
+ */
+
+struct page_buf_s;
+typedef void (*page_buf_iodone_t)(struct page_buf_s *);
+                       /* call-back function on I/O completion */
+typedef void (*page_buf_relse_t)(struct page_buf_s *);
+                       /* call-back function on I/O completion */
+typedef int (*page_buf_bdstrat_t)(struct page_buf_s *);
+
+#define PB_PAGES       4
+
+typedef struct page_buf_s {
+       struct semaphore        pb_sema;        /* semaphore for lockables  */
+       unsigned long           pb_flushtime;   /* time to flush pagebuf    */
+       atomic_t                pb_pin_count;   /* pin count                */
+       wait_queue_head_t       pb_waiters;     /* unpin waiters            */
+       struct list_head        pb_list;
+       page_buf_flags_t        pb_flags;       /* status flags */
+       struct list_head        pb_hash_list;
+       struct pb_target        *pb_target;     /* logical object */
+       atomic_t                pb_hold;        /* reference count */
+       page_buf_daddr_t        pb_bn;          /* block number for I/O */
+       loff_t                  pb_file_offset; /* offset in file */
+       size_t                  pb_buffer_length; /* size of buffer in bytes */
+       size_t                  pb_count_desired; /* desired transfer size */
+       void                    *pb_addr;       /* virtual address of buffer */
+       struct work_struct      pb_iodone_work;
+       atomic_t                pb_io_remaining;/* #outstanding I/O requests */
+       page_buf_iodone_t       pb_iodone;      /* I/O completion function */
+       page_buf_relse_t        pb_relse;       /* releasing function */
+       page_buf_bdstrat_t      pb_strat;       /* pre-write function */
+       struct semaphore        pb_iodonesema;  /* Semaphore for I/O waiters */
+       void                    *pb_fspriv;
+       void                    *pb_fspriv2;
+       void                    *pb_fspriv3;
+       unsigned short          pb_error;       /* error code on I/O */
+       unsigned short          pb_page_count;  /* size of page array */
+       unsigned short          pb_offset;      /* page offset in first page */
+       unsigned char           pb_locked;      /* page array is locked */
+       unsigned char           pb_hash_index;  /* hash table index     */
+       struct page             **pb_pages;     /* array of page pointers */
+       struct page             *pb_page_array[PB_PAGES]; /* inline pages */
+#ifdef PAGEBUF_LOCK_TRACKING
+       int                     pb_last_holder;
+#endif
+} page_buf_t;
+
+
+/* Finding and Reading Buffers */
+
+extern page_buf_t *pagebuf_find(       /* find buffer for block if     */
+                                       /* the block is in memory       */
+               struct pb_target *,     /* inode for block              */
+               loff_t,                 /* starting offset of range     */
+               size_t,                 /* length of range              */
+               page_buf_flags_t);      /* PBF_LOCK                     */
+
+extern page_buf_t *pagebuf_get(                /* allocate a buffer            */
+               struct pb_target *,     /* inode for buffer             */
+               loff_t,                 /* starting offset of range     */
+               size_t,                 /* length of range              */
+               page_buf_flags_t);      /* PBF_LOCK, PBF_READ,          */
+                                       /* PBF_ASYNC                    */
+
+extern page_buf_t *pagebuf_lookup(
+               struct pb_target *,
+               loff_t,                 /* starting offset of range     */
+               size_t,                 /* length of range              */
+               page_buf_flags_t);      /* PBF_READ, PBF_WRITE,         */
+                                       /* PBF_FORCEIO, _PBF_LOCKABLE   */
+
+extern page_buf_t *pagebuf_get_empty(  /* allocate pagebuf struct with */
+                                       /*  no memory or disk address   */
+               size_t len,
+               struct pb_target *);    /* mount point "fake" inode     */
+
+extern page_buf_t *pagebuf_get_no_daddr(/* allocate pagebuf struct     */
+                                       /* without disk address         */
+               size_t len,
+               struct pb_target *);    /* mount point "fake" inode     */
+
+extern int pagebuf_associate_memory(
+               page_buf_t *,
+               void *,
+               size_t);
+
+extern void pagebuf_hold(              /* increment reference count    */
+               page_buf_t *);          /* buffer to hold               */
+
+extern void pagebuf_readahead(         /* read ahead into cache        */
+               struct pb_target  *,    /* target for buffer (or NULL)  */
+               loff_t,                 /* starting offset of range     */
+               size_t,                 /* length of range              */
+               page_buf_flags_t);      /* additional read flags        */
+
+/* Releasing Buffers */
+
+extern void pagebuf_free(              /* deallocate a buffer          */
+               page_buf_t *);          /* buffer to deallocate         */
+
+extern void pagebuf_rele(              /* release hold on a buffer     */
+               page_buf_t *);          /* buffer to release            */
+
+/* Locking and Unlocking Buffers */
+
+extern int pagebuf_cond_lock(          /* lock buffer, if not locked   */
+                                       /* (returns -EBUSY if locked)   */
+               page_buf_t *);          /* buffer to lock               */
+
+extern int pagebuf_lock_value(         /* return count on lock         */
+               page_buf_t *);          /* buffer to check              */
+
+extern int pagebuf_lock(               /* lock buffer                  */
+               page_buf_t *);          /* buffer to lock               */
+
+extern void pagebuf_unlock(            /* unlock buffer                */
+               page_buf_t *);          /* buffer to unlock             */
+
+/* Buffer Read and Write Routines */
+
+extern void pagebuf_iodone(            /* mark buffer I/O complete     */
+               page_buf_t *,           /* buffer to mark               */
+               int,                    /* use data/log helper thread.  */
+               int);                   /* run completion locally, or in
+                                        * a helper thread.             */
+
+extern void pagebuf_ioerror(           /* mark buffer in error (or not) */
+               page_buf_t *,           /* buffer to mark               */
+               unsigned int);          /* error to store (0 if none)   */
+
+extern int pagebuf_iostart(            /* start I/O on a buffer        */
+               page_buf_t *,           /* buffer to start              */
+               page_buf_flags_t);      /* PBF_LOCK, PBF_ASYNC,         */
+                                       /* PBF_READ, PBF_WRITE,         */
+                                       /* PBF_DELWRI, PBF_SYNC         */
+
+extern int pagebuf_iorequest(          /* start real I/O               */
+               page_buf_t *);          /* buffer to convey to device   */
+
+extern int pagebuf_iowait(             /* wait for buffer I/O done     */
+               page_buf_t *);          /* buffer to wait on            */
+
+extern void pagebuf_iomove(            /* move data in/out of pagebuf  */
+               page_buf_t *,           /* buffer to manipulate         */
+               size_t,                 /* starting buffer offset       */
+               size_t,                 /* length in buffer             */
+               caddr_t,                /* data pointer                 */
+               page_buf_rw_t);         /* direction                    */
+
+static inline int pagebuf_iostrategy(page_buf_t *pb)
+{
+       return pb->pb_strat ? pb->pb_strat(pb) : pagebuf_iorequest(pb);
+}
+
+static inline int pagebuf_geterror(page_buf_t *pb)
+{
+       return pb ? pb->pb_error : ENOMEM;
+}
+
+/* Buffer Utility Routines */
+
+extern caddr_t pagebuf_offset(         /* pointer at offset in buffer  */
+               page_buf_t *,           /* buffer to offset into        */
+               size_t);                /* offset                       */
+
+/* Pinning Buffer Storage in Memory */
+
+extern void pagebuf_pin(               /* pin buffer in memory         */
+               page_buf_t *);          /* buffer to pin                */
+
+extern void pagebuf_unpin(             /* unpin buffered data          */
+               page_buf_t *);          /* buffer to unpin              */
+
+extern int pagebuf_ispin(              /* check if buffer is pinned    */
+               page_buf_t *);          /* buffer to check              */
+
+/* Delayed Write Buffer Routines */
+
+#define PBDF_WAIT    0x01
+extern void pagebuf_delwri_flush(
+               pb_target_t *,
+               unsigned long,
+               int *);
+
+extern void pagebuf_delwri_dequeue(
+               page_buf_t *);
+
+/* Buffer Daemon Setup Routines */
+
+extern int pagebuf_init(void);
+extern void pagebuf_terminate(void);
+
+
+#ifdef PAGEBUF_TRACE
+extern ktrace_t *pagebuf_trace_buf;
+extern void pagebuf_trace(
+               page_buf_t *,           /* buffer being traced          */
+               char *,                 /* description of operation     */
+               void *,                 /* arbitrary diagnostic value   */
+               void *);                /* return address               */
+#else
+# define pagebuf_trace(pb, id, ptr, ra)        do { } while (0)
+#endif
+
+#define pagebuf_target_name(target)    \
+       ({ char __b[BDEVNAME_SIZE]; bdevname((target)->pbr_bdev, __b); __b; })
+
+
+
+
+
+/* These are just for xfs_syncsub... it sets an internal variable
+ * then passes it to VOP_FLUSH_PAGES or adds the flags to a newly gotten buf_t
+ */
+#define XFS_B_ASYNC            PBF_ASYNC
+#define XFS_B_DELWRI           PBF_DELWRI
+#define XFS_B_READ             PBF_READ
+#define XFS_B_WRITE            PBF_WRITE
+#define XFS_B_STALE            PBF_STALE
+
+#define XFS_BUF_TRYLOCK                PBF_TRYLOCK
+#define XFS_INCORE_TRYLOCK     PBF_TRYLOCK
+#define XFS_BUF_LOCK           PBF_LOCK
+#define XFS_BUF_MAPPED         PBF_MAPPED
+
+#define BUF_BUSY               PBF_DONT_BLOCK
+
+#define XFS_BUF_BFLAGS(x)      ((x)->pb_flags)
+#define XFS_BUF_ZEROFLAGS(x)   \
+       ((x)->pb_flags &= ~(PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_SYNC|PBF_DELWRI))
+
+#define XFS_BUF_STALE(x)       ((x)->pb_flags |= XFS_B_STALE)
+#define XFS_BUF_UNSTALE(x)     ((x)->pb_flags &= ~XFS_B_STALE)
+#define XFS_BUF_ISSTALE(x)     ((x)->pb_flags & XFS_B_STALE)
+#define XFS_BUF_SUPER_STALE(x) do {                            \
+                                       XFS_BUF_STALE(x);       \
+                                       xfs_buf_undelay(x);     \
+                                       XFS_BUF_DONE(x);        \
+                               } while (0)
+
+#define XFS_BUF_MANAGE         PBF_FS_MANAGED
+#define XFS_BUF_UNMANAGE(x)    ((x)->pb_flags &= ~PBF_FS_MANAGED)
+
+static inline void xfs_buf_undelay(page_buf_t *pb)
+{
+       if (pb->pb_flags & PBF_DELWRI) {
+               if (pb->pb_list.next != &pb->pb_list) {
+                       pagebuf_delwri_dequeue(pb);
+                       pagebuf_rele(pb);
+               } else {
+                       pb->pb_flags &= ~PBF_DELWRI;
+               }
+       }
+}
+
+#define XFS_BUF_DELAYWRITE(x)   ((x)->pb_flags |= PBF_DELWRI)
+#define XFS_BUF_UNDELAYWRITE(x)         xfs_buf_undelay(x)
+#define XFS_BUF_ISDELAYWRITE(x)         ((x)->pb_flags & PBF_DELWRI)
+
+#define XFS_BUF_ERROR(x,no)     pagebuf_ioerror(x,no)
+#define XFS_BUF_GETERROR(x)     pagebuf_geterror(x)
+#define XFS_BUF_ISERROR(x)      (pagebuf_geterror(x)?1:0)
+
+#define XFS_BUF_DONE(x)                 ((x)->pb_flags &= ~(PBF_PARTIAL|PBF_NONE))
+#define XFS_BUF_UNDONE(x)       ((x)->pb_flags |= PBF_PARTIAL|PBF_NONE)
+#define XFS_BUF_ISDONE(x)       (!(PBF_NOT_DONE(x)))
+
+#define XFS_BUF_BUSY(x)                 ((x)->pb_flags |= PBF_FORCEIO)
+#define XFS_BUF_UNBUSY(x)       ((x)->pb_flags &= ~PBF_FORCEIO)
+#define XFS_BUF_ISBUSY(x)       (1)
+
+#define XFS_BUF_ASYNC(x)        ((x)->pb_flags |= PBF_ASYNC)
+#define XFS_BUF_UNASYNC(x)      ((x)->pb_flags &= ~PBF_ASYNC)
+#define XFS_BUF_ISASYNC(x)      ((x)->pb_flags & PBF_ASYNC)
+
+#define XFS_BUF_FLUSH(x)        ((x)->pb_flags |= PBF_FLUSH)
+#define XFS_BUF_UNFLUSH(x)      ((x)->pb_flags &= ~PBF_FLUSH)
+#define XFS_BUF_ISFLUSH(x)      ((x)->pb_flags & PBF_FLUSH)
+
+#define XFS_BUF_SHUT(x)                 printk("XFS_BUF_SHUT not implemented yet\n")
+#define XFS_BUF_UNSHUT(x)       printk("XFS_BUF_UNSHUT not implemented yet\n")
+#define XFS_BUF_ISSHUT(x)       (0)
+
+#define XFS_BUF_HOLD(x)                pagebuf_hold(x)
+#define XFS_BUF_READ(x)                ((x)->pb_flags |= PBF_READ)
+#define XFS_BUF_UNREAD(x)      ((x)->pb_flags &= ~PBF_READ)
+#define XFS_BUF_ISREAD(x)      ((x)->pb_flags & PBF_READ)
+
+#define XFS_BUF_WRITE(x)       ((x)->pb_flags |= PBF_WRITE)
+#define XFS_BUF_UNWRITE(x)     ((x)->pb_flags &= ~PBF_WRITE)
+#define XFS_BUF_ISWRITE(x)     ((x)->pb_flags & PBF_WRITE)
+
+#define XFS_BUF_ISUNINITIAL(x)  (0)
+#define XFS_BUF_UNUNINITIAL(x)  (0)
+
+#define XFS_BUF_BP_ISMAPPED(bp)         1
+
+typedef struct page_buf_s xfs_buf_t;
+#define xfs_buf page_buf_s
+
+typedef struct pb_target xfs_buftarg_t;
+#define xfs_buftarg pb_target
+
+#define XFS_BUF_DATAIO(x)      ((x)->pb_flags |= PBF_FS_DATAIOD)
+#define XFS_BUF_UNDATAIO(x)    ((x)->pb_flags &= ~PBF_FS_DATAIOD)
+
+#define XFS_BUF_IODONE_FUNC(buf)       (buf)->pb_iodone
+#define XFS_BUF_SET_IODONE_FUNC(buf, func)     \
+                       (buf)->pb_iodone = (func)
+#define XFS_BUF_CLR_IODONE_FUNC(buf)           \
+                       (buf)->pb_iodone = NULL
+#define XFS_BUF_SET_BDSTRAT_FUNC(buf, func)    \
+                       (buf)->pb_strat = (func)
+#define XFS_BUF_CLR_BDSTRAT_FUNC(buf)          \
+                       (buf)->pb_strat = NULL
+
+#define XFS_BUF_FSPRIVATE(buf, type)           \
+                       ((type)(buf)->pb_fspriv)
+#define XFS_BUF_SET_FSPRIVATE(buf, value)      \
+                       (buf)->pb_fspriv = (void *)(value)
+#define XFS_BUF_FSPRIVATE2(buf, type)          \
+                       ((type)(buf)->pb_fspriv2)
+#define XFS_BUF_SET_FSPRIVATE2(buf, value)     \
+                       (buf)->pb_fspriv2 = (void *)(value)
+#define XFS_BUF_FSPRIVATE3(buf, type)          \
+                       ((type)(buf)->pb_fspriv3)
+#define XFS_BUF_SET_FSPRIVATE3(buf, value)     \
+                       (buf)->pb_fspriv3  = (void *)(value)
+#define XFS_BUF_SET_START(buf)
+
+#define XFS_BUF_SET_BRELSE_FUNC(buf, value) \
+                       (buf)->pb_relse = (value)
+
+#define XFS_BUF_PTR(bp)                (xfs_caddr_t)((bp)->pb_addr)
+
+extern inline xfs_caddr_t xfs_buf_offset(page_buf_t *bp, size_t offset)
+{
+       if (bp->pb_flags & PBF_MAPPED)
+               return XFS_BUF_PTR(bp) + offset;
+       return (xfs_caddr_t) pagebuf_offset(bp, offset);
+}
+
+#define XFS_BUF_SET_PTR(bp, val, count)                \
+                               pagebuf_associate_memory(bp, val, count)
+#define XFS_BUF_ADDR(bp)       ((bp)->pb_bn)
+#define XFS_BUF_SET_ADDR(bp, blk)              \
+                       ((bp)->pb_bn = (page_buf_daddr_t)(blk))
+#define XFS_BUF_OFFSET(bp)     ((bp)->pb_file_offset)
+#define XFS_BUF_SET_OFFSET(bp, off)            \
+                       ((bp)->pb_file_offset = (off))
+#define XFS_BUF_COUNT(bp)      ((bp)->pb_count_desired)
+#define XFS_BUF_SET_COUNT(bp, cnt)             \
+                       ((bp)->pb_count_desired = (cnt))
+#define XFS_BUF_SIZE(bp)       ((bp)->pb_buffer_length)
+#define XFS_BUF_SET_SIZE(bp, cnt)              \
+                       ((bp)->pb_buffer_length = (cnt))
+#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)
+#define XFS_BUF_SET_VTYPE(bp, type)
+#define XFS_BUF_SET_REF(bp, ref)
+
+#define XFS_BUF_ISPINNED(bp)   pagebuf_ispin(bp)
+
+#define XFS_BUF_VALUSEMA(bp)   pagebuf_lock_value(bp)
+#define XFS_BUF_CPSEMA(bp)     (pagebuf_cond_lock(bp) == 0)
+#define XFS_BUF_VSEMA(bp)      pagebuf_unlock(bp)
+#define XFS_BUF_PSEMA(bp,x)    pagebuf_lock(bp)
+#define XFS_BUF_V_IODONESEMA(bp) up(&bp->pb_iodonesema);
+
+/* setup the buffer target from a buftarg structure */
+#define XFS_BUF_SET_TARGET(bp, target) \
+               (bp)->pb_target = (target)
+#define XFS_BUF_TARGET(bp)     ((bp)->pb_target)
+#define XFS_BUFTARG_NAME(target)       \
+               pagebuf_target_name(target)
+
+#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)
+#define XFS_BUF_SET_VTYPE(bp, type)
+#define XFS_BUF_SET_REF(bp, ref)
+
+#define xfs_buf_read(target, blkno, len, flags) \
+               pagebuf_get((target), (blkno), (len), \
+                       PBF_LOCK | PBF_READ | PBF_MAPPED | PBF_MAPPABLE)
+#define xfs_buf_get(target, blkno, len, flags) \
+               pagebuf_get((target), (blkno), (len), \
+                       PBF_LOCK | PBF_MAPPED | PBF_MAPPABLE)
+
+#define xfs_buf_read_flags(target, blkno, len, flags) \
+               pagebuf_get((target), (blkno), (len), \
+                       PBF_READ | PBF_MAPPABLE | flags)
+#define xfs_buf_get_flags(target, blkno, len, flags) \
+               pagebuf_get((target), (blkno), (len), \
+                       PBF_MAPPABLE | flags)
+
+static inline int      xfs_bawrite(void *mp, page_buf_t *bp)
+{
+       bp->pb_fspriv3 = mp;
+       bp->pb_strat = xfs_bdstrat_cb;
+       xfs_buf_undelay(bp);
+       return pagebuf_iostart(bp, PBF_WRITE | PBF_ASYNC | PBF_RUN_QUEUES);
+}
+
+static inline void     xfs_buf_relse(page_buf_t *bp)
+{
+       if ((bp->pb_flags & _PBF_LOCKABLE) && !bp->pb_relse)
+               pagebuf_unlock(bp);
+       pagebuf_rele(bp);
+}
+
+#define xfs_bpin(bp)           pagebuf_pin(bp)
+#define xfs_bunpin(bp)         pagebuf_unpin(bp)
+
+#define xfs_buftrace(id, bp)   \
+           pagebuf_trace(bp, id, NULL, (void *)__builtin_return_address(0))
+
+#define xfs_biodone(pb)                    \
+           pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), 0)
+
+#define xfs_incore(buftarg,blkno,len,lockit) \
+           pagebuf_find(buftarg, blkno ,len, lockit)
+
+
+#define xfs_biomove(pb, off, len, data, rw) \
+           pagebuf_iomove((pb), (off), (len), (data), \
+               ((rw) == XFS_B_WRITE) ? PBRW_WRITE : PBRW_READ)
+
+#define xfs_biozero(pb, off, len) \
+           pagebuf_iomove((pb), (off), (len), NULL, PBRW_ZERO)
+
+
+static inline int      XFS_bwrite(page_buf_t *pb)
+{
+       int     iowait = (pb->pb_flags & PBF_ASYNC) == 0;
+       int     error = 0;
+
+       pb->pb_flags |= PBF_SYNC;
+       if (!iowait)
+               pb->pb_flags |= PBF_RUN_QUEUES;
+
+       xfs_buf_undelay(pb);
+       pagebuf_iostrategy(pb);
+       if (iowait) {
+               error = pagebuf_iowait(pb);
+               xfs_buf_relse(pb);
+       }
+       return error;
+}
+
+#define XFS_bdwrite(pb)                     \
+           pagebuf_iostart(pb, PBF_DELWRI | PBF_ASYNC)
+
+static inline int xfs_bdwrite(void *mp, page_buf_t *bp)
+{
+       bp->pb_strat = xfs_bdstrat_cb;
+       bp->pb_fspriv3 = mp;
+
+       return pagebuf_iostart(bp, PBF_DELWRI | PBF_ASYNC);
+}
+
+#define XFS_bdstrat(bp) pagebuf_iorequest(bp)
+
+#define xfs_iowait(pb) pagebuf_iowait(pb)
+
+
+/*
+ * Go through all incore buffers, and release buffers
+ * if they belong to the given device. This is used in
+ * filesystem error handling to preserve the consistency
+ * of its metadata.
+ */
+
+#define xfs_binval(buftarg)    xfs_flush_buftarg(buftarg)
+
+#define XFS_bflush(buftarg)    xfs_flush_buftarg(buftarg)
+
+#define xfs_incore_relse(buftarg,delwri_only,wait)     \
+       xfs_relse_buftarg(buftarg)
+
+#define xfs_baread(target, rablkno, ralen)  \
+       pagebuf_readahead((target), (rablkno), (ralen), PBF_DONT_BLOCK)
+
+#define xfs_buf_get_empty(len, target) pagebuf_get_empty((len), (target))
+#define xfs_buf_get_noaddr(len, target)        pagebuf_get_no_daddr((len), (target))
+#define xfs_buf_free(bp)               pagebuf_free(bp)
+
+#endif /* __XFS_BUF_H__ */
+
diff --git a/fs/xfs/linux/xfs_iomap.c b/fs/xfs/linux/xfs_iomap.c
deleted file mode 100644 (file)
index ae4818a..0000000
+++ /dev/null
@@ -1,795 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like.         Any license provided herein, whether implied or
- * otherwise, applies only to this software file.  Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA  94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-
-#include "xfs.h"
-
-#include "xfs_fs.h"
-#include "xfs_inum.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_bit.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_rw.h"
-#include "xfs_acl.h"
-#include "xfs_cap.h"
-#include "xfs_mac.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_trans_space.h"
-#include "xfs_utils.h"
-#include "xfs_iomap.h"
-
-#define XFS_WRITEIO_ALIGN(mp,off)      (((off) >> mp->m_writeio_log) \
-                                               << mp->m_writeio_log)
-#define XFS_STRAT_WRITE_IMAPS  2
-#define XFS_WRITE_IMAPS                XFS_BMAP_MAX_NMAP
-
-STATIC int
-xfs_imap_to_bmap(
-       xfs_iocore_t    *io,
-       xfs_off_t       offset,
-       xfs_bmbt_irec_t *imap,
-       xfs_iomap_t     *iomapp,
-       int             imaps,                  /* Number of imap entries */
-       int             iomaps,                 /* Number of iomap entries */
-       int             flags)
-{
-       xfs_mount_t     *mp;
-       xfs_fsize_t     nisize;
-       int             pbm;
-       xfs_fsblock_t   start_block;
-
-       mp = io->io_mount;
-       nisize = XFS_SIZE(mp, io);
-       if (io->io_new_size > nisize)
-               nisize = io->io_new_size;
-
-       for (pbm = 0; imaps && pbm < iomaps; imaps--, iomapp++, imap++, pbm++) {
-               iomapp->iomap_target = io->io_flags & XFS_IOCORE_RT ?
-                       mp->m_rtdev_targp : mp->m_ddev_targp;
-               iomapp->iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff);
-               iomapp->iomap_delta = offset - iomapp->iomap_offset;
-               iomapp->iomap_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount);
-               iomapp->iomap_flags = flags;
-
-               start_block = imap->br_startblock;
-               if (start_block == HOLESTARTBLOCK) {
-                       iomapp->iomap_bn = IOMAP_DADDR_NULL;
-                       iomapp->iomap_flags = IOMAP_HOLE;
-               } else if (start_block == DELAYSTARTBLOCK) {
-                       iomapp->iomap_bn = IOMAP_DADDR_NULL;
-                       iomapp->iomap_flags = IOMAP_DELAY;
-               } else {
-                       iomapp->iomap_bn = XFS_FSB_TO_DB_IO(io, start_block);
-                       if (ISUNWRITTEN(imap))
-                               iomapp->iomap_flags |= IOMAP_UNWRITTEN;
-               }
-
-               if ((iomapp->iomap_offset + iomapp->iomap_bsize) >= nisize) {
-                       iomapp->iomap_flags |= IOMAP_EOF;
-               }
-
-               offset += iomapp->iomap_bsize - iomapp->iomap_delta;
-       }
-       return pbm;     /* Return the number filled */
-}
-
-int
-xfs_iomap(
-       xfs_iocore_t    *io,
-       xfs_off_t       offset,
-       ssize_t         count,
-       int             flags,
-       xfs_iomap_t     *iomapp,
-       int             *niomaps)
-{
-       xfs_mount_t     *mp = io->io_mount;
-       xfs_fileoff_t   offset_fsb, end_fsb;
-       int             error = 0;
-       int             lockmode = 0;
-       xfs_bmbt_irec_t imap;
-       int             nimaps = 1;
-       int             bmapi_flags = 0;
-       int             iomap_flags = 0;
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
-
-       switch (flags &
-               (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE |
-                BMAPI_UNWRITTEN | BMAPI_DEVICE)) {
-       case BMAPI_READ:
-               lockmode = XFS_LCK_MAP_SHARED(mp, io);
-               bmapi_flags = XFS_BMAPI_ENTIRE;
-               if (flags & BMAPI_IGNSTATE)
-                       bmapi_flags |= XFS_BMAPI_IGSTATE;
-               break;
-       case BMAPI_WRITE:
-               lockmode = XFS_ILOCK_EXCL|XFS_EXTSIZE_WR;
-               bmapi_flags = 0;
-               XFS_ILOCK(mp, io, lockmode);
-               break;
-       case BMAPI_ALLOCATE:
-               lockmode = XFS_ILOCK_SHARED|XFS_EXTSIZE_RD;
-               bmapi_flags = XFS_BMAPI_ENTIRE;
-               /* Attempt non-blocking lock */
-               if (flags & BMAPI_TRYLOCK) {
-                       if (!XFS_ILOCK_NOWAIT(mp, io, lockmode))
-                               return XFS_ERROR(EAGAIN);
-               } else {
-                       XFS_ILOCK(mp, io, lockmode);
-               }
-               break;
-       case BMAPI_UNWRITTEN:
-               goto phase2;
-       case BMAPI_DEVICE:
-               lockmode = XFS_LCK_MAP_SHARED(mp, io);
-               iomapp->iomap_target = io->io_flags & XFS_IOCORE_RT ?
-                       mp->m_rtdev_targp : mp->m_ddev_targp;
-               error = 0;
-               *niomaps = 1;
-               goto out;
-       default:
-               BUG();
-       }
-
-       ASSERT(offset <= mp->m_maxioffset);
-       if ((xfs_fsize_t)offset + count > mp->m_maxioffset)
-               count = mp->m_maxioffset - offset;
-       end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
-       offset_fsb = XFS_B_TO_FSBT(mp, offset);
-
-       error = XFS_BMAPI(mp, NULL, io, offset_fsb,
-                       (xfs_filblks_t)(end_fsb - offset_fsb),
-                       bmapi_flags,  NULL, 0, &imap,
-                       &nimaps, NULL);
-
-       if (error)
-               goto out;
-
-phase2:
-       switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE|BMAPI_UNWRITTEN)) {
-       case BMAPI_WRITE:
-               /* If we found an extent, return it */
-               if (nimaps && (imap.br_startblock != HOLESTARTBLOCK))
-                       break;
-
-               if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) {
-                       error = XFS_IOMAP_WRITE_DIRECT(mp, io, offset,
-                                       count, flags, &imap, &nimaps, nimaps);
-               } else {
-                       error = XFS_IOMAP_WRITE_DELAY(mp, io, offset, count,
-                                       flags, &imap, &nimaps);
-               }
-               iomap_flags = IOMAP_NEW;
-               break;
-       case BMAPI_ALLOCATE:
-               /* If we found an extent, return it */
-               XFS_IUNLOCK(mp, io, lockmode);
-               lockmode = 0;
-
-               if (nimaps && !ISNULLSTARTBLOCK(imap.br_startblock))
-                       break;
-
-               error = XFS_IOMAP_WRITE_ALLOCATE(mp, io, &imap, &nimaps);
-               break;
-       case BMAPI_UNWRITTEN:
-               lockmode = 0;
-               error = XFS_IOMAP_WRITE_UNWRITTEN(mp, io, offset, count);
-               nimaps = 0;
-               break;
-       }
-
-       if (nimaps) {
-               *niomaps = xfs_imap_to_bmap(io, offset, &imap,
-                                       iomapp, nimaps, *niomaps, iomap_flags);
-       } else if (niomaps) {
-               *niomaps = 0;
-       }
-
-out:
-       if (lockmode)
-               XFS_IUNLOCK(mp, io, lockmode);
-       return XFS_ERROR(error);
-}
-
-STATIC int
-xfs_flush_space(
-       xfs_inode_t     *ip,
-       int             *fsynced,
-       int             *ioflags)
-{
-       switch (*fsynced) {
-       case 0:
-               if (ip->i_delayed_blks) {
-                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                       xfs_flush_inode(ip);
-                       xfs_ilock(ip, XFS_ILOCK_EXCL);
-                       *fsynced = 1;
-               } else {
-                       *ioflags |= BMAPI_SYNC;
-                       *fsynced = 2;
-               }
-               return 0;
-       case 1:
-               *fsynced = 2;
-               *ioflags |= BMAPI_SYNC;
-               return 0;
-       case 2:
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               xfs_flush_device(ip);
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               *fsynced = 3;
-               return 0;
-       }
-       return 1;
-}
-
-int
-xfs_iomap_write_direct(
-       xfs_inode_t     *ip,
-       loff_t          offset,
-       size_t          count,
-       int             flags,
-       xfs_bmbt_irec_t *ret_imap,
-       int             *nmaps,
-       int             found)
-{
-       xfs_mount_t     *mp = ip->i_mount;
-       xfs_iocore_t    *io = &ip->i_iocore;
-       xfs_fileoff_t   offset_fsb;
-       xfs_fileoff_t   last_fsb;
-       xfs_filblks_t   count_fsb;
-       xfs_fsize_t     isize;
-       xfs_fsblock_t   firstfsb;
-       int             nimaps, maps;
-       int             error;
-       int             bmapi_flag;
-       int             rt;
-       xfs_trans_t     *tp;
-       xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS], *imapp;
-       xfs_bmap_free_t free_list;
-       int             aeof;
-       xfs_filblks_t   datablocks;
-       int             committed;
-       int             numrtextents;
-       uint            resblks;
-
-       /*
-        * Make sure that the dquots are there. This doesn't hold
-        * the ilock across a disk read.
-        */
-
-       error = XFS_QM_DQATTACH(ip->i_mount, ip, XFS_QMOPT_ILOCKED);
-       if (error)
-               return XFS_ERROR(error);
-
-       maps = min(XFS_WRITE_IMAPS, *nmaps);
-       nimaps = maps;
-
-       isize = ip->i_d.di_size;
-       aeof = (offset + count) > isize;
-
-       if (io->io_new_size > isize)
-               isize = io->io_new_size;
-
-       offset_fsb = XFS_B_TO_FSBT(mp, offset);
-       last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
-       count_fsb = last_fsb - offset_fsb;
-       if (found && (ret_imap->br_startblock == HOLESTARTBLOCK)) {
-               xfs_fileoff_t   map_last_fsb;
-
-               map_last_fsb = ret_imap->br_blockcount + ret_imap->br_startoff;
-
-               if (map_last_fsb < last_fsb) {
-                       last_fsb = map_last_fsb;
-                       count_fsb = last_fsb - offset_fsb;
-               }
-               ASSERT(count_fsb > 0);
-       }
-
-       /*
-        * determine if reserving space on
-        * the data or realtime partition.
-        */
-       if ((rt = XFS_IS_REALTIME_INODE(ip))) {
-               int     sbrtextsize, iprtextsize;
-
-               sbrtextsize = mp->m_sb.sb_rextsize;
-               iprtextsize =
-                       ip->i_d.di_extsize ? ip->i_d.di_extsize : sbrtextsize;
-               numrtextents = (count_fsb + iprtextsize - 1);
-               do_div(numrtextents, sbrtextsize);
-               datablocks = 0;
-       } else {
-               datablocks = count_fsb;
-               numrtextents = 0;
-       }
-
-       /*
-        * allocate and setup the transaction
-        */
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-
-       resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks);
-
-       error = xfs_trans_reserve(tp, resblks,
-                       XFS_WRITE_LOG_RES(mp), numrtextents,
-                       XFS_TRANS_PERM_LOG_RES,
-                       XFS_WRITE_LOG_COUNT);
-
-       /*
-        * check for running out of space
-        */
-       if (error)
-               /*
-                * Free the transaction structure.
-                */
-               xfs_trans_cancel(tp, 0);
-
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-       if (error)
-               goto error_out; /* Don't return in above if .. trans ..,
-                                       need lock to return */
-
-       if (XFS_TRANS_RESERVE_BLKQUOTA(mp, tp, ip, resblks)) {
-               error = (EDQUOT);
-               goto error1;
-       }
-       nimaps = 1;
-
-       bmapi_flag = XFS_BMAPI_WRITE;
-       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-       xfs_trans_ihold(tp, ip);
-
-       if (!(flags & BMAPI_MMAP) && (offset < ip->i_d.di_size || rt))
-               bmapi_flag |= XFS_BMAPI_PREALLOC;
-
-       /*
-        * issue the bmapi() call to allocate the blocks
-        */
-       XFS_BMAP_INIT(&free_list, &firstfsb);
-       imapp = &imap[0];
-       error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
-               bmapi_flag, &firstfsb, 0, imapp, &nimaps, &free_list);
-       if (error) {
-               goto error0;
-       }
-
-       /*
-        * complete the transaction
-        */
-
-       error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
-       if (error) {
-               goto error0;
-       }
-
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
-       if (error) {
-               goto error_out;
-       }
-
-       /* copy any maps to caller's array and return any error. */
-       if (nimaps == 0) {
-               error = (ENOSPC);
-               goto error_out;
-       }
-
-       *ret_imap = imap[0];
-       *nmaps = 1;
-       return 0;
-
- error0:       /* Cancel bmap, unlock inode, and cancel trans */
-       xfs_bmap_cancel(&free_list);
-
- error1:       /* Just cancel transaction */
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
-       *nmaps = 0;     /* nothing set-up here */
-
-error_out:
-       return XFS_ERROR(error);
-}
-
-int
-xfs_iomap_write_delay(
-       xfs_inode_t     *ip,
-       loff_t          offset,
-       size_t          count,
-       int             ioflag,
-       xfs_bmbt_irec_t *ret_imap,
-       int             *nmaps)
-{
-       xfs_mount_t     *mp = ip->i_mount;
-       xfs_iocore_t    *io = &ip->i_iocore;
-       xfs_fileoff_t   offset_fsb;
-       xfs_fileoff_t   last_fsb;
-       xfs_fsize_t     isize;
-       xfs_fsblock_t   firstblock;
-       int             nimaps;
-       int             error;
-       xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
-       int             aeof;
-       int             fsynced = 0;
-
-       ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0);
-
-       /*
-        * Make sure that the dquots are there. This doesn't hold
-        * the ilock across a disk read.
-        */
-
-       error = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED);
-       if (error)
-               return XFS_ERROR(error);
-
-retry:
-       isize = ip->i_d.di_size;
-       if (io->io_new_size > isize) {
-               isize = io->io_new_size;
-       }
-
-       aeof = 0;
-       offset_fsb = XFS_B_TO_FSBT(mp, offset);
-       last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
-       /*
-        * If the caller is doing a write at the end of the file,
-        * then extend the allocation (and the buffer used for the write)
-        * out to the file system's write iosize.  We clean up any extra
-        * space left over when the file is closed in xfs_inactive().
-        *
-        * We don't bother with this for sync writes, because we need
-        * to minimize the amount we write for good performance.
-        */
-       if (!(ioflag & BMAPI_SYNC) && ((offset + count) > ip->i_d.di_size)) {
-               xfs_off_t       aligned_offset;
-               unsigned int    iosize;
-               xfs_fileoff_t   ioalign;
-
-               iosize = mp->m_writeio_blocks;
-               aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
-               ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
-               last_fsb = ioalign + iosize;
-               aeof = 1;
-       }
-
-       nimaps = XFS_WRITE_IMAPS;
-       firstblock = NULLFSBLOCK;
-
-       /*
-        * roundup the allocation request to m_dalign boundary if file size
-        * is greater that 512K and we are allocating past the allocation eof
-        */
-       if (mp->m_dalign && (isize >= mp->m_dalign) && aeof) {
-               int eof;
-               xfs_fileoff_t new_last_fsb;
-               new_last_fsb = roundup_64(last_fsb, mp->m_dalign);
-               error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
-               if (error) {
-                       return error;
-               }
-               if (eof) {
-                       last_fsb = new_last_fsb;
-               }
-       }
-
-       error = xfs_bmapi(NULL, ip, offset_fsb,
-                         (xfs_filblks_t)(last_fsb - offset_fsb),
-                         XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
-                         XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
-                         &nimaps, NULL);
-       /*
-        * This can be EDQUOT, if nimaps == 0
-        */
-       if (error && (error != ENOSPC)) {
-               return XFS_ERROR(error);
-       }
-       /*
-        * If bmapi returned us nothing, and if we didn't get back EDQUOT,
-        * then we must have run out of space.
-        */
-
-       if (nimaps == 0) {
-               if (xfs_flush_space(ip, &fsynced, &ioflag))
-                       return XFS_ERROR(ENOSPC);
-
-               error = 0;
-               goto retry;
-       }
-
-       *ret_imap = imap[0];
-       *nmaps = 1;
-       return 0;
-}
-
-/*
- * Pass in a delayed allocate extent, convert it to real extents;
- * return to the caller the extent we create which maps on top of
- * the originating callers request.
- *
- * Called without a lock on the inode.
- */
-int
-xfs_iomap_write_allocate(
-       xfs_inode_t     *ip,
-       xfs_bmbt_irec_t *map,
-       int             *retmap)
-{
-       xfs_mount_t     *mp = ip->i_mount;
-       xfs_fileoff_t   offset_fsb, last_block;
-       xfs_fileoff_t   end_fsb, map_start_fsb;
-       xfs_fsblock_t   first_block;
-       xfs_bmap_free_t free_list;
-       xfs_filblks_t   count_fsb;
-       xfs_bmbt_irec_t imap[XFS_STRAT_WRITE_IMAPS];
-       xfs_trans_t     *tp;
-       int             i, nimaps, committed;
-       int             error = 0;
-       int             nres;
-
-       *retmap = 0;
-
-       /*
-        * Make sure that the dquots are there.
-        */
-
-       if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
-               return XFS_ERROR(error);
-
-       offset_fsb = map->br_startoff;
-       count_fsb = map->br_blockcount;
-       map_start_fsb = offset_fsb;
-
-       XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
-
-       while (count_fsb != 0) {
-               /*
-                * Set up a transaction with which to allocate the
-                * backing store for the file.  Do allocations in a
-                * loop until we get some space in the range we are
-                * interested in.  The other space that might be allocated
-                * is in the delayed allocation extent on which we sit
-                * but before our buffer starts.
-                */
-
-               nimaps = 0;
-               while (nimaps == 0) {
-                       tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
-                       nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
-                       error = xfs_trans_reserve(tp, nres,
-                                       XFS_WRITE_LOG_RES(mp),
-                                       0, XFS_TRANS_PERM_LOG_RES,
-                                       XFS_WRITE_LOG_COUNT);
-
-                       if (error == ENOSPC) {
-                               error = xfs_trans_reserve(tp, 0,
-                                               XFS_WRITE_LOG_RES(mp),
-                                               0,
-                                               XFS_TRANS_PERM_LOG_RES,
-                                               XFS_WRITE_LOG_COUNT);
-                       }
-                       if (error) {
-                               xfs_trans_cancel(tp, 0);
-                               return XFS_ERROR(error);
-                       }
-                       xfs_ilock(ip, XFS_ILOCK_EXCL);
-                       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-                       xfs_trans_ihold(tp, ip);
-
-                       XFS_BMAP_INIT(&free_list, &first_block);
-
-                       nimaps = XFS_STRAT_WRITE_IMAPS;
-                       /*
-                        * Ensure we don't go beyond eof - it is possible
-                        * the extents changed since we did the read call,
-                        * we dropped the ilock in the interim.
-                        */
-
-                       end_fsb = XFS_B_TO_FSB(mp, ip->i_d.di_size);
-                       xfs_bmap_last_offset(NULL, ip, &last_block,
-                               XFS_DATA_FORK);
-                       last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
-                       if ((map_start_fsb + count_fsb) > last_block) {
-                               count_fsb = last_block - map_start_fsb;
-                               if (count_fsb == 0) {
-                                       error = EAGAIN;
-                                       goto trans_cancel;
-                               }
-                       }
-
-                       /* Go get the actual blocks */
-                       error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb,
-                                       XFS_BMAPI_WRITE, &first_block, 1,
-                                       imap, &nimaps, &free_list);
-
-                       if (error)
-                               goto trans_cancel;
-
-                       error = xfs_bmap_finish(&tp, &free_list,
-                                       first_block, &committed);
-
-                       if (error)
-                               goto trans_cancel;
-
-                       error = xfs_trans_commit(tp,
-                                       XFS_TRANS_RELEASE_LOG_RES, NULL);
-
-                       if (error)
-                               goto error0;
-
-                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               }
-
-               /*
-                * See if we were able to allocate an extent that
-                * covers at least part of the callers request
-                */
-
-               for (i = 0; i < nimaps; i++) {
-                       if ((map->br_startoff >= imap[i].br_startoff) &&
-                           (map->br_startoff < (imap[i].br_startoff +
-                                                imap[i].br_blockcount))) {
-                               *map = imap[i];
-                               *retmap = 1;
-                               XFS_STATS_INC(xs_xstrat_quick);
-                               return 0;
-                       }
-                       count_fsb -= imap[i].br_blockcount;
-               }
-
-               /* So far we have not mapped the requested part of the
-                * file, just surrounding data, try again.
-                */
-               nimaps--;
-               offset_fsb = imap[nimaps].br_startoff +
-                            imap[nimaps].br_blockcount;
-               map_start_fsb = offset_fsb;
-       }
-
-trans_cancel:
-       xfs_bmap_cancel(&free_list);
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
-error0:
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       return XFS_ERROR(error);
-}
-
-int
-xfs_iomap_write_unwritten(
-       xfs_inode_t     *ip,
-       loff_t          offset,
-       size_t          count)
-{
-       xfs_mount_t     *mp = ip->i_mount;
-       xfs_trans_t     *tp;
-       xfs_fileoff_t   offset_fsb;
-       xfs_filblks_t   count_fsb;
-       xfs_filblks_t   numblks_fsb;
-       xfs_bmbt_irec_t imap;
-       int             committed;
-       int             error;
-       int             nres;
-       int             nimaps;
-       xfs_fsblock_t   firstfsb;
-       xfs_bmap_free_t free_list;
-
-       offset_fsb = XFS_B_TO_FSBT(mp, offset);
-       count_fsb = XFS_B_TO_FSB(mp, count);
-
-       do {
-               nres = XFS_DIOSTRAT_SPACE_RES(mp, 0);
-
-               /*
-                * set up a transaction to convert the range of extents
-                * from unwritten to real. Do allocations in a loop until
-                * we have covered the range passed in.
-                */
-
-               tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
-               error = xfs_trans_reserve(tp, nres,
-                               XFS_WRITE_LOG_RES(mp), 0,
-                               XFS_TRANS_PERM_LOG_RES,
-                               XFS_WRITE_LOG_COUNT);
-               if (error) {
-                       xfs_trans_cancel(tp, 0);
-                       goto error0;
-               }
-
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-               xfs_trans_ihold(tp, ip);
-
-               /*
-                * Modify the unwritten extent state of the buffer.
-                */
-               XFS_BMAP_INIT(&free_list, &firstfsb);
-               nimaps = 1;
-               error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
-                                 XFS_BMAPI_WRITE, &firstfsb,
-                                 1, &imap, &nimaps, &free_list);
-               if (error)
-                       goto error_on_bmapi_transaction;
-
-               error = xfs_bmap_finish(&(tp), &(free_list),
-                               firstfsb, &committed);
-               if (error)
-                       goto error_on_bmapi_transaction;
-
-               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               if (error)
-                       goto error0;
-
-               if ((numblks_fsb = imap.br_blockcount) == 0) {
-                       /*
-                        * The numblks_fsb value should always get
-                        * smaller, otherwise the loop is stuck.
-                        */
-                       ASSERT(imap.br_blockcount);
-                       break;
-               }
-               offset_fsb += numblks_fsb;
-               count_fsb -= numblks_fsb;
-       } while (count_fsb > 0);
-
-       return 0;
-
-error_on_bmapi_transaction:
-       xfs_bmap_cancel(&free_list);
-       xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-error0:
-       return XFS_ERROR(error);
-}
index 39b2fff855e3552356613fc6f312f13d7b2930e4..1cf06f8ec8ff55b9c4cfd6d8c9311bc3e89662ea 100644 (file)
 #include <asm/byteorder.h>
 #include <asm/unaligned.h>
 
-#include <linux/xfs_behavior.h>
-#include <linux/xfs_vfs.h>
-#include <linux/xfs_cred.h>
-#include <linux/xfs_vnode.h>
-#include <linux/xfs_stats.h>
-#include <linux/xfs_sysctl.h>
-#include <linux/xfs_iops.h>
-#include <linux/xfs_super.h>
-#include <linux/xfs_globals.h>
-#include <linux/xfs_fs_subr.h>
-#include <linux/xfs_lrw.h>
-
-#include <pagebuf/page_buf.h>
+#include <mrlock.h>
+#include <spin.h>
+#include <sv.h>
+#include <mutex.h>
+#include <sema.h>
+#include <time.h>
+#include <kmem.h>
+
+#include <xfs_behavior.h>
+#include <xfs_vfs.h>
+#include <xfs_cred.h>
+#include <xfs_vnode.h>
+#include <xfs_stats.h>
+#include <xfs_sysctl.h>
+#include <xfs_iops.h>
+#include <xfs_super.h>
+#include <xfs_globals.h>
+#include <xfs_fs_subr.h>
+#include <xfs_lrw.h>
+#include <xfs_buf.h>
 
 /*
  * Feature macros (disable/enable)
diff --git a/fs/xfs/pagebuf/page_buf.c b/fs/xfs/pagebuf/page_buf.c
deleted file mode 100644 (file)
index 5508239..0000000
+++ /dev/null
@@ -1,2107 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like.  Any license provided herein, whether implied or
- * otherwise, applies only to this software file.  Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA  94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-
-/*
- *     page_buf.c
- *
- *     The page_buf module provides an abstract buffer cache model on top of
- *     the Linux page cache.  Cached metadata blocks for a file system are
- *     hashed to the inode for the block device.  The page_buf module
- *     assembles buffer (page_buf_t) objects on demand to aggregate such
- *     cached pages for I/O.
- *
- *
- *      Written by Steve Lord, Jim Mostek, Russell Cattelan
- *                 and Rajagopal Ananthanarayanan ("ananth") at SGI.
- *
- */
-
-#include <linux/module.h>
-#include <linux/stddef.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/pagemap.h>
-#include <linux/init.h>
-#include <linux/vmalloc.h>
-#include <linux/blkdev.h>
-#include <linux/bio.h>
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
-#include <linux/workqueue.h>
-#include <linux/suspend.h>
-#include <linux/percpu.h>
-
-#include <support/ktrace.h>
-#include <support/debug.h>
-#include <support/kmem.h>
-
-#include "page_buf.h"
-
-#define BBSHIFT                9
-#define BN_ALIGN_MASK  ((1 << (PAGE_CACHE_SHIFT - BBSHIFT)) - 1)
-
-#ifndef GFP_READAHEAD
-#define GFP_READAHEAD  (__GFP_NOWARN|__GFP_NORETRY)
-#endif
-
-/*
- * File wide globals
- */
-
-STATIC kmem_cache_t *pagebuf_cache;
-STATIC void pagebuf_daemon_wakeup(int);
-STATIC void pagebuf_delwri_queue(page_buf_t *, int);
-STATIC struct workqueue_struct *pagebuf_logio_workqueue;
-STATIC struct workqueue_struct *pagebuf_dataio_workqueue;
-
-/*
- * Pagebuf module configuration parameters, exported via
- * /proc/sys/vm/pagebuf
- */
-
-typedef struct pb_sysctl_val {
-       int     min;
-       int     val;
-       int     max;
-} pb_sysctl_val_t;
-
-struct {
-       pb_sysctl_val_t flush_interval; /* interval between runs of the
-                                        * delwri flush daemon.  */
-       pb_sysctl_val_t age_buffer;     /* time for buffer to age before
-                                        * we flush it.  */
-       pb_sysctl_val_t stats_clear;    /* clear the pagebuf stats */
-       pb_sysctl_val_t debug;          /* debug tracing on or off */
-} pb_params = {
-                         /*    MIN     DFLT    MAX     */
-       .flush_interval = {     HZ/2,   HZ,     30*HZ   },
-       .age_buffer     = {     1*HZ,   15*HZ,  300*HZ  },
-       .stats_clear    = {     0,      0,      1       },
-       .debug          = {     0,      0,      1       },
-};
-
-enum {
-       PB_FLUSH_INT = 1,
-       PB_FLUSH_AGE = 2,
-       PB_STATS_CLEAR = 3,
-       PB_DEBUG = 4,
-};
-
-/*
- * Pagebuf statistics variables
- */
-
-struct pbstats {
-       u_int32_t       pb_get;
-       u_int32_t       pb_create;
-       u_int32_t       pb_get_locked;
-       u_int32_t       pb_get_locked_waited;
-       u_int32_t       pb_busy_locked;
-       u_int32_t       pb_miss_locked;
-       u_int32_t       pb_page_retries;
-       u_int32_t       pb_page_found;
-       u_int32_t       pb_get_read;
-} pbstats;
-DEFINE_PER_CPU(struct pbstats, pbstats);
-
-/* We don't disable preempt, not too worried about poking the
- * wrong cpu's stat for now */
-#define PB_STATS_INC(count)    (__get_cpu_var(pbstats).count++)
-
-/*
- * Pagebuf debugging
- */
-
-#ifdef PAGEBUF_TRACE
-void
-pagebuf_trace(
-       page_buf_t      *pb,
-       char            *id,
-       void            *data,
-       void            *ra)
-{
-       if (!pb_params.debug.val)
-               return;
-       ktrace_enter(pagebuf_trace_buf,
-               pb, id,
-               (void *)(unsigned long)pb->pb_flags,
-               (void *)(unsigned long)pb->pb_hold.counter,
-               (void *)(unsigned long)pb->pb_sema.count.counter,
-               (void *)current,
-               data, ra,
-               (void *)(unsigned long)((pb->pb_file_offset>>32) & 0xffffffff),
-               (void *)(unsigned long)(pb->pb_file_offset & 0xffffffff),
-               (void *)(unsigned long)pb->pb_buffer_length,
-               NULL, NULL, NULL, NULL, NULL);
-}
-ktrace_t *pagebuf_trace_buf;
-EXPORT_SYMBOL(pagebuf_trace_buf);
-#define PAGEBUF_TRACE_SIZE     4096
-#define PB_TRACE(pb, id, data) \
-       pagebuf_trace(pb, id, (void *)data, (void *)__builtin_return_address(0))
-#else
-#define PB_TRACE(pb, id, data) do { } while (0)
-#endif
-
-#ifdef PAGEBUF_LOCK_TRACKING
-# define PB_SET_OWNER(pb)      ((pb)->pb_last_holder = current->pid)
-# define PB_CLEAR_OWNER(pb)    ((pb)->pb_last_holder = -1)
-# define PB_GET_OWNER(pb)      ((pb)->pb_last_holder)
-#else
-# define PB_SET_OWNER(pb)      do { } while (0)
-# define PB_CLEAR_OWNER(pb)    do { } while (0)
-# define PB_GET_OWNER(pb)      do { } while (0)
-#endif
-
-/*
- * Pagebuf allocation / freeing.
- */
-
-#define pb_to_gfp(flags) \
-       (((flags) & PBF_READ_AHEAD) ? GFP_READAHEAD : \
-        ((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL)
-
-#define pagebuf_allocate(flags) \
-       kmem_cache_alloc(pagebuf_cache, pb_to_gfp(flags))
-#define pagebuf_deallocate(pb) \
-       kmem_cache_free(pagebuf_cache, (pb));
-
-/*
- * Pagebuf hashing
- */
-
-#define NBITS  8
-#define NHASH  (1<<NBITS)
-
-typedef struct {
-       struct list_head        pb_hash;
-       int                     pb_count;
-       spinlock_t              pb_hash_lock;
-} pb_hash_t;
-
-STATIC pb_hash_t       pbhash[NHASH];
-#define pb_hash(pb)    &pbhash[pb->pb_hash_index]
-
-STATIC int
-_bhash(
-       struct block_device *bdev,
-       loff_t          base)
-{
-       int             bit, hval;
-
-       base >>= 9;
-       base ^= (unsigned long)bdev / L1_CACHE_BYTES;
-       for (bit = hval = 0; base && bit < sizeof(base) * 8; bit += NBITS) {
-               hval ^= (int)base & (NHASH-1);
-               base >>= NBITS;
-       }
-       return hval;
-}
-
-/*
- * Mapping of multi-page buffers into contiguous virtual space
- */
-
-STATIC void *pagebuf_mapout_locked(page_buf_t *);
-
-typedef struct a_list {
-       void            *vm_addr;
-       struct a_list   *next;
-} a_list_t;
-
-STATIC a_list_t                *as_free_head;
-STATIC int             as_list_len;
-STATIC spinlock_t      as_lock = SPIN_LOCK_UNLOCKED;
-
-/*
- * Try to batch vunmaps because they are costly.
- */
-STATIC void
-free_address(
-       void            *addr)
-{
-       a_list_t        *aentry;
-
-       aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC);
-       if (aentry) {
-               spin_lock(&as_lock);
-               aentry->next = as_free_head;
-               aentry->vm_addr = addr;
-               as_free_head = aentry;
-               as_list_len++;
-               spin_unlock(&as_lock);
-       } else {
-               vunmap(addr);
-       }
-}
-
-STATIC void
-purge_addresses(void)
-{
-       a_list_t        *aentry, *old;
-
-       if (as_free_head == NULL)
-               return;
-
-       spin_lock(&as_lock);
-       aentry = as_free_head;
-       as_free_head = NULL;
-       as_list_len = 0;
-       spin_unlock(&as_lock);
-
-       while ((old = aentry) != NULL) {
-               vunmap(aentry->vm_addr);
-               aentry = aentry->next;
-               kfree(old);
-       }
-}
-
-/*
- *     Internal pagebuf object manipulation
- */
-
-STATIC void
-_pagebuf_initialize(
-       page_buf_t              *pb,
-       pb_target_t             *target,
-       loff_t                  range_base,
-       size_t                  range_length,
-       page_buf_flags_t        flags)
-{
-       /*
-        * We don't want certain flags to appear in pb->pb_flags.
-        */
-       flags &= ~(PBF_LOCK|PBF_MAPPED|PBF_DONT_BLOCK|PBF_READ_AHEAD);
-
-       memset(pb, 0, sizeof(page_buf_t));
-       atomic_set(&pb->pb_hold, 1);
-       init_MUTEX_LOCKED(&pb->pb_iodonesema);
-       INIT_LIST_HEAD(&pb->pb_list);
-       INIT_LIST_HEAD(&pb->pb_hash_list);
-       init_MUTEX_LOCKED(&pb->pb_sema); /* held, no waiters */
-       PB_SET_OWNER(pb);
-       pb->pb_target = target;
-       pb->pb_file_offset = range_base;
-       /*
-        * Set buffer_length and count_desired to the same value initially.
-        * IO routines should use count_desired, which will be the same in
-        * most cases but may be reset (e.g. XFS recovery).
-        */
-       pb->pb_buffer_length = pb->pb_count_desired = range_length;
-       pb->pb_flags = flags | PBF_NONE;
-       pb->pb_bn = PAGE_BUF_DADDR_NULL;
-       atomic_set(&pb->pb_pin_count, 0);
-       init_waitqueue_head(&pb->pb_waiters);
-
-       PB_STATS_INC(pb_create);
-       PB_TRACE(pb, "initialize", target);
-}
-
-/*
- * Allocate a page array capable of holding a specified number
- * of pages, and point the page buf at it.
- */
-STATIC int
-_pagebuf_get_pages(
-       page_buf_t              *pb,
-       int                     page_count,
-       page_buf_flags_t        flags)
-{
-       int                     gpf_mask = pb_to_gfp(flags);
-
-       /* Make sure that we have a page list */
-       if (pb->pb_pages == NULL) {
-               pb->pb_offset = page_buf_poff(pb->pb_file_offset);
-               pb->pb_page_count = page_count;
-               if (page_count <= PB_PAGES) {
-                       pb->pb_pages = pb->pb_page_array;
-               } else {
-                       pb->pb_pages = kmalloc(sizeof(struct page *) *
-                                       page_count, gpf_mask);
-                       if (pb->pb_pages == NULL)
-                               return -ENOMEM;
-               }
-               memset(pb->pb_pages, 0, sizeof(struct page *) * page_count);
-       }
-       return 0;
-}
-
-/*
- * Walk a pagebuf releasing all the pages contained within it.
- */
-STATIC inline void
-_pagebuf_freepages(
-       page_buf_t              *pb)
-{
-       int                     buf_index;
-
-       for (buf_index = 0; buf_index < pb->pb_page_count; buf_index++) {
-               struct page     *page = pb->pb_pages[buf_index];
-
-               if (page) {
-                       pb->pb_pages[buf_index] = NULL;
-                       page_cache_release(page);
-               }
-       }
-}
-
-/*
- *     _pagebuf_free_object
- *
- *     _pagebuf_free_object releases the contents specified buffer.
- *     The modification state of any associated pages is left unchanged.
- */
-void
-_pagebuf_free_object(
-       pb_hash_t               *hash,  /* hash bucket for buffer */
-       page_buf_t              *pb)    /* buffer to deallocate */
-{
-       page_buf_flags_t        pb_flags = pb->pb_flags;
-
-       PB_TRACE(pb, "free_object", 0);
-       pb->pb_flags |= PBF_FREED;
-
-       if (hash) {
-               if (!list_empty(&pb->pb_hash_list)) {
-                       hash->pb_count--;
-                       list_del_init(&pb->pb_hash_list);
-               }
-               spin_unlock(&hash->pb_hash_lock);
-       }
-
-       if (!(pb_flags & PBF_FREED)) {
-               /* release any virtual mapping */ ;
-               if (pb->pb_flags & _PBF_ADDR_ALLOCATED) {
-                       void *vaddr = pagebuf_mapout_locked(pb);
-                       if (vaddr) {
-                               free_address(vaddr);
-                       }
-               }
-
-               if (pb->pb_flags & _PBF_MEM_ALLOCATED) {
-                       if (pb->pb_pages) {
-                               /* release the pages in the address list */
-                               if ((pb->pb_pages[0]) &&
-                                   (pb->pb_flags & _PBF_MEM_SLAB)) {
-                                       kfree(pb->pb_addr);
-                               } else {
-                                       _pagebuf_freepages(pb);
-                               }
-                               if (pb->pb_pages != pb->pb_page_array)
-                                       kfree(pb->pb_pages);
-                               pb->pb_pages = NULL;
-                       }
-                       pb->pb_flags &= ~(_PBF_MEM_ALLOCATED|_PBF_MEM_SLAB);
-               }
-       }
-
-       pagebuf_deallocate(pb);
-}
-
-/*
- *     _pagebuf_lookup_pages
- *
- *     _pagebuf_lookup_pages finds all pages which match the buffer
- *     in question and the range of file offsets supplied,
- *     and builds the page list for the buffer, if the
- *     page list is not already formed or if not all of the pages are
- *     already in the list. Invalid pages (pages which have not yet been
- *     read in from disk) are assigned for any pages which are not found.
- */
-STATIC int
-_pagebuf_lookup_pages(
-       page_buf_t              *pb,
-       struct address_space    *aspace,
-       page_buf_flags_t        flags)
-{
-       loff_t                  next_buffer_offset;
-       unsigned long           page_count, pi, index;
-       struct page             *page;
-       int                     gfp_mask, retry_count = 5, rval = 0;
-       int                     all_mapped, good_pages, nbytes;
-       unsigned int            blocksize, sectorshift;
-       size_t                  size, offset;
-
-
-       /* For pagebufs where we want to map an address, do not use
-        * highmem pages - so that we do not need to use kmap resources
-        * to access the data.
-        *
-        * For pages where the caller has indicated there may be resource
-        * contention (e.g. called from a transaction) do not flush
-        * delalloc pages to obtain memory.
-        */
-
-       if (flags & PBF_READ_AHEAD) {
-               gfp_mask = GFP_READAHEAD;
-               retry_count = 0;
-       } else if (flags & PBF_DONT_BLOCK) {
-               gfp_mask = GFP_NOFS;
-       } else if (flags & PBF_MAPPABLE) {
-               gfp_mask = GFP_KERNEL;
-       } else {
-               gfp_mask = GFP_HIGHUSER;
-       }
-
-       next_buffer_offset = pb->pb_file_offset + pb->pb_buffer_length;
-
-       good_pages = page_count = (page_buf_btoc(next_buffer_offset) -
-                                  page_buf_btoct(pb->pb_file_offset));
-
-       if (pb->pb_flags & _PBF_ALL_PAGES_MAPPED) {
-               /* Bring pages forward in cache */
-               for (pi = 0; pi < page_count; pi++) {
-                       mark_page_accessed(pb->pb_pages[pi]);
-               }
-               if ((flags & PBF_MAPPED) && !(pb->pb_flags & PBF_MAPPED)) {
-                       all_mapped = 1;
-                       goto mapit;
-               }
-               return 0;
-       }
-
-       /* Ensure pb_pages field has been initialised */
-       rval = _pagebuf_get_pages(pb, page_count, flags);
-       if (rval)
-               return rval;
-
-       rval = pi = 0;
-       blocksize = pb->pb_target->pbr_bsize;
-       sectorshift = pb->pb_target->pbr_sshift;
-       size = pb->pb_count_desired;
-       offset = pb->pb_offset;
-
-       /* Enter the pages in the page list */
-       index = (pb->pb_file_offset - pb->pb_offset) >> PAGE_CACHE_SHIFT;
-       for (all_mapped = 1; pi < page_count; pi++, index++) {
-               if (pb->pb_pages[pi] == 0) {
-                     retry:
-                       page = find_or_create_page(aspace, index, gfp_mask);
-                       if (!page) {
-                               if (--retry_count > 0) {
-                                       PB_STATS_INC(pb_page_retries);
-                                       pagebuf_daemon_wakeup(1);
-                                       current->state = TASK_UNINTERRUPTIBLE;
-                                       schedule_timeout(10);
-                                       goto retry;
-                               }
-                               rval = -ENOMEM;
-                               all_mapped = 0;
-                               continue;
-                       }
-                       PB_STATS_INC(pb_page_found);
-                       mark_page_accessed(page);
-                       pb->pb_pages[pi] = page;
-               } else {
-                       page = pb->pb_pages[pi];
-                       lock_page(page);
-               }
-
-               nbytes = PAGE_CACHE_SIZE - offset;
-               if (nbytes > size)
-                       nbytes = size;
-               size -= nbytes;
-
-               if (!PageUptodate(page)) {
-                       if (blocksize == PAGE_CACHE_SIZE) {
-                               if (flags & PBF_READ)
-                                       pb->pb_locked = 1;
-                               good_pages--;
-                       } else if (!PagePrivate(page)) {
-                               unsigned long   i, range;
-
-                               /*
-                                * In this case page->private holds a bitmap
-                                * of uptodate sectors within the page
-                                */
-                               ASSERT(blocksize < PAGE_CACHE_SIZE);
-                               range = (offset + nbytes) >> sectorshift;
-                               for (i = offset >> sectorshift; i < range; i++)
-                                       if (!test_bit(i, &page->private))
-                                               break;
-                               if (i != range)
-                                       good_pages--;
-                       } else {
-                               good_pages--;
-                       }
-               }
-               offset = 0;
-       }
-
-       if (!pb->pb_locked) {
-               for (pi = 0; pi < page_count; pi++) {
-                       if (pb->pb_pages[pi])
-                               unlock_page(pb->pb_pages[pi]);
-               }
-       }
-
-mapit:
-       pb->pb_flags |= _PBF_MEM_ALLOCATED;
-       if (all_mapped) {
-               pb->pb_flags |= _PBF_ALL_PAGES_MAPPED;
-
-               /* A single page buffer is always mappable */
-               if (page_count == 1) {
-                       pb->pb_addr = (caddr_t)
-                               page_address(pb->pb_pages[0]) + pb->pb_offset;
-                       pb->pb_flags |= PBF_MAPPED;
-               } else if (flags & PBF_MAPPED) {
-                       if (as_list_len > 64)
-                               purge_addresses();
-                       pb->pb_addr = vmap(pb->pb_pages, page_count,
-                                       VM_MAP, PAGE_KERNEL);
-                       if (pb->pb_addr == NULL)
-                               return -ENOMEM;
-                       pb->pb_addr += pb->pb_offset;
-                       pb->pb_flags |= PBF_MAPPED | _PBF_ADDR_ALLOCATED;
-               }
-       }
-       /* If some pages were found with data in them
-        * we are not in PBF_NONE state.
-        */
-       if (good_pages != 0) {
-               pb->pb_flags &= ~(PBF_NONE);
-               if (good_pages != page_count) {
-                       pb->pb_flags |= PBF_PARTIAL;
-               }
-       }
-
-       PB_TRACE(pb, "lookup_pages", (long)good_pages);
-
-       return rval;
-}
-
-/*
- *     Finding and Reading Buffers
- */
-
-/*
- *     _pagebuf_find
- *
- *     Looks up, and creates if absent, a lockable buffer for
- *     a given range of an inode.  The buffer is returned
- *     locked.  If other overlapping buffers exist, they are
- *     released before the new buffer is created and locked,
- *     which may imply that this call will block until those buffers
- *     are unlocked.  No I/O is implied by this call.
- */
-STATIC page_buf_t *
-_pagebuf_find(                         /* find buffer for block        */
-       pb_target_t             *target,/* target for block             */
-       loff_t                  ioff,   /* starting offset of range     */
-       size_t                  isize,  /* length of range              */
-       page_buf_flags_t        flags,  /* PBF_TRYLOCK                  */
-       page_buf_t              *new_pb)/* newly allocated buffer       */
-{
-       loff_t                  range_base;
-       size_t                  range_length;
-       int                     hval;
-       pb_hash_t               *h;
-       struct list_head        *p;
-       page_buf_t              *pb;
-       int                     not_locked;
-
-       range_base = (ioff << BBSHIFT);
-       range_length = (isize << BBSHIFT);
-
-       /* Ensure we never do IOs smaller than the sector size */
-       BUG_ON(range_length < (1 << target->pbr_sshift));
-
-       /* Ensure we never do IOs that are not sector aligned */
-       BUG_ON(range_base & (loff_t)target->pbr_smask);
-
-       hval = _bhash(target->pbr_bdev, range_base);
-       h = &pbhash[hval];
-
-       spin_lock(&h->pb_hash_lock);
-       list_for_each(p, &h->pb_hash) {
-               pb = list_entry(p, page_buf_t, pb_hash_list);
-
-               if ((target == pb->pb_target) &&
-                   (pb->pb_file_offset == range_base) &&
-                   (pb->pb_buffer_length == range_length)) {
-                       if (pb->pb_flags & PBF_FREED)
-                               break;
-                       /* If we look at something bring it to the
-                        * front of the list for next time
-                        */
-                       list_del(&pb->pb_hash_list);
-                       list_add(&pb->pb_hash_list, &h->pb_hash);
-                       goto found;
-               }
-       }
-
-       /* No match found */
-       if (new_pb) {
-               _pagebuf_initialize(new_pb, target, range_base,
-                               range_length, flags | _PBF_LOCKABLE);
-               new_pb->pb_hash_index = hval;
-               h->pb_count++;
-               list_add(&new_pb->pb_hash_list, &h->pb_hash);
-       } else {
-               PB_STATS_INC(pb_miss_locked);
-       }
-
-       spin_unlock(&h->pb_hash_lock);
-       return (new_pb);
-
-found:
-       atomic_inc(&pb->pb_hold);
-       spin_unlock(&h->pb_hash_lock);
-
-       /* Attempt to get the semaphore without sleeping,
-        * if this does not work then we need to drop the
-        * spinlock and do a hard attempt on the semaphore.
-        */
-       not_locked = down_trylock(&pb->pb_sema);
-       if (not_locked) {
-               if (!(flags & PBF_TRYLOCK)) {
-                       /* wait for buffer ownership */
-                       PB_TRACE(pb, "get_lock", 0);
-                       pagebuf_lock(pb);
-                       PB_STATS_INC(pb_get_locked_waited);
-               } else {
-                       /* We asked for a trylock and failed, no need
-                        * to look at file offset and length here, we
-                        * know that this pagebuf at least overlaps our
-                        * pagebuf and is locked, therefore our buffer
-                        * either does not exist, or is this buffer
-                        */
-
-                       pagebuf_rele(pb);
-                       PB_STATS_INC(pb_busy_locked);
-                       return (NULL);
-               }
-       } else {
-               /* trylock worked */
-               PB_SET_OWNER(pb);
-       }
-
-       if (pb->pb_flags & PBF_STALE)
-               pb->pb_flags &= PBF_MAPPABLE | \
-                               PBF_MAPPED | \
-                               _PBF_LOCKABLE | \
-                               _PBF_ALL_PAGES_MAPPED | \
-                               _PBF_ADDR_ALLOCATED | \
-                               _PBF_MEM_ALLOCATED | \
-                               _PBF_MEM_SLAB;
-       PB_TRACE(pb, "got_lock", 0);
-       PB_STATS_INC(pb_get_locked);
-       return (pb);
-}
-
-
-/*
- *     pagebuf_find
- *
- *     pagebuf_find returns a buffer matching the specified range of
- *     data for the specified target, if any of the relevant blocks
- *     are in memory.  The buffer may have unallocated holes, if
- *     some, but not all, of the blocks are in memory.  Even where
- *     pages are present in the buffer, not all of every page may be
- *     valid.
- */
-page_buf_t *
-pagebuf_find(                          /* find buffer for block        */
-                                       /* if the block is in memory    */
-       pb_target_t             *target,/* target for block             */
-       loff_t                  ioff,   /* starting offset of range     */
-       size_t                  isize,  /* length of range              */
-       page_buf_flags_t        flags)  /* PBF_TRYLOCK                  */
-{
-       return _pagebuf_find(target, ioff, isize, flags, NULL);
-}
-
-/*
- *     pagebuf_get
- *
- *     pagebuf_get assembles a buffer covering the specified range.
- *     Some or all of the blocks in the range may be valid.  Storage
- *     in memory for all portions of the buffer will be allocated,
- *     although backing storage may not be.  If PBF_READ is set in
- *     flags, pagebuf_iostart is called also.
- */
-page_buf_t *
-pagebuf_get(                           /* allocate a buffer            */
-       pb_target_t             *target,/* target for buffer            */
-       loff_t                  ioff,   /* starting offset of range     */
-       size_t                  isize,  /* length of range              */
-       page_buf_flags_t        flags)  /* PBF_TRYLOCK                  */
-{
-       page_buf_t              *pb, *new_pb;
-       int                     error;
-
-       new_pb = pagebuf_allocate(flags);
-       if (unlikely(!new_pb))
-               return (NULL);
-
-       pb = _pagebuf_find(target, ioff, isize, flags, new_pb);
-       if (pb != new_pb) {
-               pagebuf_deallocate(new_pb);
-               if (unlikely(!pb))
-                       return (NULL);
-       }
-
-       PB_STATS_INC(pb_get);
-
-       /* fill in any missing pages */
-       error = _pagebuf_lookup_pages(pb, pb->pb_target->pbr_mapping, flags);
-       if (unlikely(error)) {
-               pagebuf_free(pb);
-               return (NULL);
-       }
-
-       /*
-        * Always fill in the block number now, the mapped cases can do
-        * their own overlay of this later.
-        */
-       pb->pb_bn = ioff;
-       pb->pb_count_desired = pb->pb_buffer_length;
-
-       if (flags & PBF_READ) {
-               if (PBF_NOT_DONE(pb)) {
-                       PB_TRACE(pb, "get_read", (unsigned long)flags);
-                       PB_STATS_INC(pb_get_read);
-                       pagebuf_iostart(pb, flags);
-               } else if (flags & PBF_ASYNC) {
-                       PB_TRACE(pb, "get_read_async", (unsigned long)flags);
-                       /*
-                        * Read ahead call which is already satisfied,
-                        * drop the buffer
-                        */
-                       if (flags & (PBF_LOCK | PBF_TRYLOCK))
-                               pagebuf_unlock(pb);
-                       pagebuf_rele(pb);
-                       return NULL;
-               } else {
-                       PB_TRACE(pb, "get_read_done", (unsigned long)flags);
-                       /* We do not want read in the flags */
-                       pb->pb_flags &= ~PBF_READ;
-               }
-       } else {
-               PB_TRACE(pb, "get_write", (unsigned long)flags);
-       }
-       return (pb);
-}
-
-/*
- * Create a skeletal pagebuf (no pages associated with it).
- */
-page_buf_t *
-pagebuf_lookup(
-       struct pb_target        *target,
-       loff_t                  ioff,
-       size_t                  isize,
-       page_buf_flags_t        flags)
-{
-       page_buf_t              *pb;
-
-       pb = pagebuf_allocate(flags);
-       if (pb) {
-               _pagebuf_initialize(pb, target, ioff, isize, flags);
-       }
-       return pb;
-}
-
-/*
- * If we are not low on memory then do the readahead in a deadlock
- * safe manner.
- */
-void
-pagebuf_readahead(
-       pb_target_t             *target,
-       loff_t                  ioff,
-       size_t                  isize,
-       page_buf_flags_t        flags)
-{
-       struct backing_dev_info *bdi;
-
-       bdi = target->pbr_mapping->backing_dev_info;
-       if (bdi_read_congested(bdi))
-               return;
-       if (bdi_write_congested(bdi))
-               return;
-
-       flags |= (PBF_TRYLOCK|PBF_READ|PBF_ASYNC|PBF_MAPPABLE|PBF_READ_AHEAD);
-       pagebuf_get(target, ioff, isize, flags);
-}
-
-page_buf_t *
-pagebuf_get_empty(
-       size_t                  len,
-       pb_target_t             *target)
-{
-       page_buf_t              *pb;
-
-       pb = pagebuf_allocate(_PBF_LOCKABLE);
-       if (pb)
-               _pagebuf_initialize(pb, target, 0, len, _PBF_LOCKABLE);
-       return pb;
-}
-
-static inline struct page *
-mem_to_page(
-       void                    *addr)
-{
-       if (((unsigned long)addr < VMALLOC_START) ||
-           ((unsigned long)addr >= VMALLOC_END)) {
-               return virt_to_page(addr);
-       } else {
-               return vmalloc_to_page(addr);
-       }
-}
-
-int
-pagebuf_associate_memory(
-       page_buf_t              *pb,
-       void                    *mem,
-       size_t                  len)
-{
-       int                     rval;
-       int                     i = 0;
-       size_t                  ptr;
-       size_t                  end, end_cur;
-       off_t                   offset;
-       int                     page_count;
-
-       page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
-       offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK);
-       if (offset && (len > PAGE_CACHE_SIZE))
-               page_count++;
-
-       /* Free any previous set of page pointers */
-       if (pb->pb_pages && (pb->pb_pages != pb->pb_page_array)) {
-               kfree(pb->pb_pages);
-       }
-       pb->pb_pages = NULL;
-       pb->pb_addr = mem;
-
-       rval = _pagebuf_get_pages(pb, page_count, 0);
-       if (rval)
-               return rval;
-
-       pb->pb_offset = offset;
-       ptr = (size_t) mem & PAGE_CACHE_MASK;
-       end = PAGE_CACHE_ALIGN((size_t) mem + len);
-       end_cur = end;
-       /* set up first page */
-       pb->pb_pages[0] = mem_to_page(mem);
-
-       ptr += PAGE_CACHE_SIZE;
-       pb->pb_page_count = ++i;
-       while (ptr < end) {
-               pb->pb_pages[i] = mem_to_page((void *)ptr);
-               pb->pb_page_count = ++i;
-               ptr += PAGE_CACHE_SIZE;
-       }
-       pb->pb_locked = 0;
-
-       pb->pb_count_desired = pb->pb_buffer_length = len;
-       pb->pb_flags |= PBF_MAPPED;
-
-       return 0;
-}
-
-page_buf_t *
-pagebuf_get_no_daddr(
-       size_t                  len,
-       pb_target_t             *target)
-{
-       int                     rval;
-       void                    *rmem = NULL;
-       page_buf_flags_t        flags = _PBF_LOCKABLE | PBF_FORCEIO;
-       page_buf_t              *pb;
-       size_t                  tlen = 0;
-
-       if (unlikely(len > 0x20000))
-               return NULL;
-
-       pb = pagebuf_allocate(flags);
-       if (!pb)
-               return NULL;
-
-       _pagebuf_initialize(pb, target, 0, len, flags);
-
-       do {
-               if (tlen == 0) {
-                       tlen = len; /* first time */
-               } else {
-                       kfree(rmem); /* free the mem from the previous try */
-                       tlen <<= 1; /* double the size and try again */
-               }
-               if ((rmem = kmalloc(tlen, GFP_KERNEL)) == 0) {
-                       pagebuf_free(pb);
-                       return NULL;
-               }
-       } while ((size_t)rmem != ((size_t)rmem & ~target->pbr_smask));
-
-       if ((rval = pagebuf_associate_memory(pb, rmem, len)) != 0) {
-               kfree(rmem);
-               pagebuf_free(pb);
-               return NULL;
-       }
-       /* otherwise pagebuf_free just ignores it */
-       pb->pb_flags |= (_PBF_MEM_ALLOCATED | _PBF_MEM_SLAB);
-       PB_CLEAR_OWNER(pb);
-       up(&pb->pb_sema);       /* Return unlocked pagebuf */
-
-       PB_TRACE(pb, "no_daddr", rmem);
-
-       return pb;
-}
-
-
-/*
- *     pagebuf_hold
- *
- *     Increment reference count on buffer, to hold the buffer concurrently
- *     with another thread which may release (free) the buffer asynchronously.
- *
- *     Must hold the buffer already to call this function.
- */
-void
-pagebuf_hold(
-       page_buf_t              *pb)
-{
-       atomic_inc(&pb->pb_hold);
-       PB_TRACE(pb, "hold", 0);
-}
-
-/*
- *     pagebuf_free
- *
- *     pagebuf_free releases the specified buffer.  The modification
- *     state of any associated pages is left unchanged.
- */
-void
-pagebuf_free(
-       page_buf_t              *pb)
-{
-       if (pb->pb_flags & _PBF_LOCKABLE) {
-               pb_hash_t       *h = pb_hash(pb);
-
-               spin_lock(&h->pb_hash_lock);
-               _pagebuf_free_object(h, pb);
-       } else {
-               _pagebuf_free_object(NULL, pb);
-       }
-}
-
-/*
- *     pagebuf_rele
- *
- *     pagebuf_rele releases a hold on the specified buffer.  If the
- *     the hold count is 1, pagebuf_rele calls pagebuf_free.
- */
-void
-pagebuf_rele(
-       page_buf_t              *pb)
-{
-       pb_hash_t               *h;
-
-       PB_TRACE(pb, "rele", pb->pb_relse);
-       if (pb->pb_flags & _PBF_LOCKABLE) {
-               h = pb_hash(pb);
-               spin_lock(&h->pb_hash_lock);
-       } else {
-               h = NULL;
-       }
-
-       if (atomic_dec_and_test(&pb->pb_hold)) {
-               int             do_free = 1;
-
-               if (pb->pb_relse) {
-                       atomic_inc(&pb->pb_hold);
-                       if (h)
-                               spin_unlock(&h->pb_hash_lock);
-                       (*(pb->pb_relse)) (pb);
-                       do_free = 0;
-               }
-               if (pb->pb_flags & PBF_DELWRI) {
-                       pb->pb_flags |= PBF_ASYNC;
-                       atomic_inc(&pb->pb_hold);
-                       if (h && do_free)
-                               spin_unlock(&h->pb_hash_lock);
-                       pagebuf_delwri_queue(pb, 0);
-                       do_free = 0;
-               } else if (pb->pb_flags & PBF_FS_MANAGED) {
-                       if (h)
-                               spin_unlock(&h->pb_hash_lock);
-                       do_free = 0;
-               }
-
-               if (do_free) {
-                       _pagebuf_free_object(h, pb);
-               }
-       } else if (h) {
-               spin_unlock(&h->pb_hash_lock);
-       }
-}
-
-
-/*
- *     Mutual exclusion on buffers.  Locking model:
- *
- *     Buffers associated with inodes for which buffer locking
- *     is not enabled are not protected by semaphores, and are
- *     assumed to be exclusively owned by the caller.  There is a
- *     spinlock in the buffer, used by the caller when concurrent
- *     access is possible.
- */
-
-/*
- *     pagebuf_cond_lock
- *
- *     pagebuf_cond_lock locks a buffer object, if it is not already locked.
- *     Note that this in no way
- *     locks the underlying pages, so it is only useful for synchronizing
- *     concurrent use of page buffer objects, not for synchronizing independent
- *     access to the underlying pages.
- */
-int
-pagebuf_cond_lock(                     /* lock buffer, if not locked   */
-                                       /* returns -EBUSY if locked)    */
-       page_buf_t              *pb)
-{
-       int                     locked;
-
-       ASSERT(pb->pb_flags & _PBF_LOCKABLE);
-       locked = down_trylock(&pb->pb_sema) == 0;
-       if (locked) {
-               PB_SET_OWNER(pb);
-       }
-       PB_TRACE(pb, "cond_lock", (long)locked);
-       return(locked ? 0 : -EBUSY);
-}
-
-/*
- *     pagebuf_lock_value
- *
- *     Return lock value for a pagebuf
- */
-int
-pagebuf_lock_value(
-       page_buf_t              *pb)
-{
-       ASSERT(pb->pb_flags & _PBF_LOCKABLE);
-       return(atomic_read(&pb->pb_sema.count));
-}
-
-/*
- *     pagebuf_lock
- *
- *     pagebuf_lock locks a buffer object.  Note that this in no way
- *     locks the underlying pages, so it is only useful for synchronizing
- *     concurrent use of page buffer objects, not for synchronizing independent
- *     access to the underlying pages.
- */
-int
-pagebuf_lock(
-       page_buf_t              *pb)
-{
-       ASSERT(pb->pb_flags & _PBF_LOCKABLE);
-
-       PB_TRACE(pb, "lock", 0);
-       if (atomic_read(&pb->pb_io_remaining))
-               blk_run_queues();
-       down(&pb->pb_sema);
-       PB_SET_OWNER(pb);
-       PB_TRACE(pb, "locked", 0);
-       return 0;
-}
-
-/*
- *     pagebuf_unlock
- *
- *     pagebuf_unlock releases the lock on the buffer object created by
- *     pagebuf_lock or pagebuf_cond_lock (not any
- *     pinning of underlying pages created by pagebuf_pin).
- */
-void
-pagebuf_unlock(                                /* unlock buffer                */
-       page_buf_t              *pb)    /* buffer to unlock             */
-{
-       ASSERT(pb->pb_flags & _PBF_LOCKABLE);
-       PB_CLEAR_OWNER(pb);
-       up(&pb->pb_sema);
-       PB_TRACE(pb, "unlock", 0);
-}
-
-
-/*
- *     Pinning Buffer Storage in Memory
- */
-
-/*
- *     pagebuf_pin
- *
- *     pagebuf_pin locks all of the memory represented by a buffer in
- *     memory.  Multiple calls to pagebuf_pin and pagebuf_unpin, for
- *     the same or different buffers affecting a given page, will
- *     properly count the number of outstanding "pin" requests.  The
- *     buffer may be released after the pagebuf_pin and a different
- *     buffer used when calling pagebuf_unpin, if desired.
- *     pagebuf_pin should be used by the file system when it wants be
- *     assured that no attempt will be made to force the affected
- *     memory to disk.  It does not assure that a given logical page
- *     will not be moved to a different physical page.
- */
-void
-pagebuf_pin(
-       page_buf_t              *pb)
-{
-       atomic_inc(&pb->pb_pin_count);
-       PB_TRACE(pb, "pin", (long)pb->pb_pin_count.counter);
-}
-
-/*
- *     pagebuf_unpin
- *
- *     pagebuf_unpin reverses the locking of memory performed by
- *     pagebuf_pin.  Note that both functions affected the logical
- *     pages associated with the buffer, not the buffer itself.
- */
-void
-pagebuf_unpin(
-       page_buf_t              *pb)
-{
-       if (atomic_dec_and_test(&pb->pb_pin_count)) {
-               wake_up_all(&pb->pb_waiters);
-       }
-       PB_TRACE(pb, "unpin", (long)pb->pb_pin_count.counter);
-}
-
-int
-pagebuf_ispin(
-       page_buf_t              *pb)
-{
-       return atomic_read(&pb->pb_pin_count);
-}
-
-/*
- *     pagebuf_wait_unpin
- *
- *     pagebuf_wait_unpin waits until all of the memory associated
- *     with the buffer is not longer locked in memory.  It returns
- *     immediately if none of the affected pages are locked.
- */
-static inline void
-_pagebuf_wait_unpin(
-       page_buf_t              *pb)
-{
-       DECLARE_WAITQUEUE       (wait, current);
-
-       if (atomic_read(&pb->pb_pin_count) == 0)
-               return;
-
-       add_wait_queue(&pb->pb_waiters, &wait);
-       for (;;) {
-               current->state = TASK_UNINTERRUPTIBLE;
-               if (atomic_read(&pb->pb_pin_count) == 0)
-                       break;
-               if (atomic_read(&pb->pb_io_remaining))
-                       blk_run_queues();
-               schedule();
-       }
-       remove_wait_queue(&pb->pb_waiters, &wait);
-       current->state = TASK_RUNNING;
-}
-
-/*
- *     Buffer Utility Routines
- */
-
-/*
- *     pagebuf_iodone
- *
- *     pagebuf_iodone marks a buffer for which I/O is in progress
- *     done with respect to that I/O.  The pb_iodone routine, if
- *     present, will be called as a side-effect.
- */
-void
-pagebuf_iodone_work(
-       void                    *v)
-{
-       page_buf_t              *pb = (page_buf_t *)v;
-
-       if (pb->pb_iodone) {
-               (*(pb->pb_iodone)) (pb);
-               return;
-       }
-
-       if (pb->pb_flags & PBF_ASYNC) {
-               if ((pb->pb_flags & _PBF_LOCKABLE) && !pb->pb_relse)
-                       pagebuf_unlock(pb);
-               pagebuf_rele(pb);
-       }
-}
-
-void
-pagebuf_iodone(
-       page_buf_t              *pb,
-       int                     dataio,
-       int                     schedule)
-{
-       pb->pb_flags &= ~(PBF_READ | PBF_WRITE);
-       if (pb->pb_error == 0) {
-               pb->pb_flags &= ~(PBF_PARTIAL | PBF_NONE);
-       }
-
-       PB_TRACE(pb, "iodone", pb->pb_iodone);
-
-       if ((pb->pb_iodone) || (pb->pb_flags & PBF_ASYNC)) {
-               if (schedule) {
-                       INIT_WORK(&pb->pb_iodone_work, pagebuf_iodone_work, pb);
-                       queue_work(dataio ? pagebuf_dataio_workqueue :
-                               pagebuf_logio_workqueue, &pb->pb_iodone_work);
-               } else {
-                       pagebuf_iodone_work(pb);
-               }
-       } else {
-               up(&pb->pb_iodonesema);
-       }
-}
-
-/*
- *     pagebuf_ioerror
- *
- *     pagebuf_ioerror sets the error code for a buffer.
- */
-void
-pagebuf_ioerror(                       /* mark/clear buffer error flag */
-       page_buf_t              *pb,    /* buffer to mark               */
-       unsigned int            error)  /* error to store (0 if none)   */
-{
-       pb->pb_error = error;
-       PB_TRACE(pb, "ioerror", (unsigned long)error);
-}
-
-/*
- *     pagebuf_iostart
- *
- *     pagebuf_iostart initiates I/O on a buffer, based on the flags supplied.
- *     If necessary, it will arrange for any disk space allocation required,
- *     and it will break up the request if the block mappings require it.
- *     The pb_iodone routine in the buffer supplied will only be called
- *     when all of the subsidiary I/O requests, if any, have been completed.
- *     pagebuf_iostart calls the pagebuf_ioinitiate routine or
- *     pagebuf_iorequest, if the former routine is not defined, to start
- *     the I/O on a given low-level request.
- */
-int
-pagebuf_iostart(                       /* start I/O on a buffer          */
-       page_buf_t              *pb,    /* buffer to start                */
-       page_buf_flags_t        flags)  /* PBF_LOCK, PBF_ASYNC, PBF_READ, */
-                                       /* PBF_WRITE, PBF_DELWRI,         */
-                                       /* PBF_SYNC, PBF_DONT_BLOCK       */
-{
-       int                     status = 0;
-
-       PB_TRACE(pb, "iostart", (unsigned long)flags);
-
-       if (flags & PBF_DELWRI) {
-               pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC);
-               pb->pb_flags |= flags &
-                               (PBF_DELWRI | PBF_ASYNC | PBF_SYNC);
-               pagebuf_delwri_queue(pb, 1);
-               return status;
-       }
-
-       pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC | \
-                       PBF_DELWRI | PBF_READ_AHEAD | PBF_RUN_QUEUES);
-       pb->pb_flags |= flags & (PBF_READ | PBF_WRITE | PBF_ASYNC | \
-                       PBF_SYNC | PBF_READ_AHEAD | PBF_RUN_QUEUES);
-
-       BUG_ON(pb->pb_bn == PAGE_BUF_DADDR_NULL);
-
-       /* For writes allow an alternate strategy routine to precede
-        * the actual I/O request (which may not be issued at all in
-        * a shutdown situation, for example).
-        */
-       status = (flags & PBF_WRITE) ?
-               pagebuf_iostrategy(pb) : pagebuf_iorequest(pb);
-
-       /* Wait for I/O if we are not an async request.
-        * Note: async I/O request completion will release the buffer,
-        * and that can already be done by this point.  So using the
-        * buffer pointer from here on, after async I/O, is invalid.
-        */
-       if (!status && !(flags & PBF_ASYNC))
-               status = pagebuf_iowait(pb);
-
-       return status;
-}
-
-/*
- * Helper routine for pagebuf_iorequest
- */
-
-STATIC __inline__ int
-_pagebuf_iolocked(
-       page_buf_t              *pb)
-{
-       ASSERT(pb->pb_flags & (PBF_READ|PBF_WRITE));
-       if (pb->pb_flags & PBF_READ)
-               return pb->pb_locked;
-       return ((pb->pb_flags & _PBF_LOCKABLE) == 0);
-}
-
-STATIC __inline__ void
-_pagebuf_iodone(
-       page_buf_t              *pb,
-       int                     schedule)
-{
-       if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) {
-               pb->pb_locked = 0;
-               pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), schedule);
-       }
-}
-
-STATIC int
-bio_end_io_pagebuf(
-       struct bio              *bio,
-       unsigned int            bytes_done,
-       int                     error)
-{
-       page_buf_t              *pb = (page_buf_t *)bio->bi_private;
-       unsigned int            i, blocksize = pb->pb_target->pbr_bsize;
-       unsigned int            sectorshift = pb->pb_target->pbr_sshift;
-       struct bio_vec          *bvec = bio->bi_io_vec;
-
-       if (bio->bi_size)
-               return 1;
-
-       if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
-               pb->pb_error = EIO;
-
-       for (i = 0; i < bio->bi_vcnt; i++, bvec++) {
-               struct page     *page = bvec->bv_page;
-
-               if (pb->pb_error) {
-                       SetPageError(page);
-               } else if (blocksize == PAGE_CACHE_SIZE) {
-                       SetPageUptodate(page);
-               } else if (!PagePrivate(page)) {
-                       unsigned int    j, range;
-
-                       ASSERT(blocksize < PAGE_CACHE_SIZE);
-                       range = (bvec->bv_offset + bvec->bv_len) >> sectorshift;
-                       for (j = bvec->bv_offset >> sectorshift; j < range; j++)
-                               set_bit(j, &page->private);
-                       if (page->private == (unsigned long)(PAGE_CACHE_SIZE-1))
-                               SetPageUptodate(page);
-               }
-
-               if (_pagebuf_iolocked(pb)) {
-                       unlock_page(page);
-               }
-       }
-
-       _pagebuf_iodone(pb, 1);
-       bio_put(bio);
-       return 0;
-}
-
-void
-_pagebuf_ioapply(
-       page_buf_t              *pb)
-{
-       int                     i, map_i, total_nr_pages, nr_pages;
-       struct bio              *bio;
-       int                     offset = pb->pb_offset;
-       int                     size = pb->pb_count_desired;
-       sector_t                sector = pb->pb_bn;
-       unsigned int            blocksize = pb->pb_target->pbr_bsize;
-       int                     locking = _pagebuf_iolocked(pb);
-
-       total_nr_pages = pb->pb_page_count;
-       map_i = 0;
-
-       /* Special code path for reading a sub page size pagebuf in --
-        * we populate up the whole page, and hence the other metadata
-        * in the same page.  This optimization is only valid when the
-        * filesystem block size and the page size are equal.
-        */
-       if ((pb->pb_buffer_length < PAGE_CACHE_SIZE) &&
-           (pb->pb_flags & PBF_READ) && locking &&
-           (blocksize == PAGE_CACHE_SIZE)) {
-               bio = bio_alloc(GFP_NOIO, 1);
-
-               bio->bi_bdev = pb->pb_target->pbr_bdev;
-               bio->bi_sector = sector - (offset >> BBSHIFT);
-               bio->bi_end_io = bio_end_io_pagebuf;
-               bio->bi_private = pb;
-
-               bio_add_page(bio, pb->pb_pages[0], PAGE_CACHE_SIZE, 0);
-               size = 0;
-
-               atomic_inc(&pb->pb_io_remaining);
-
-               goto submit_io;
-       }
-
-       /* Lock down the pages which we need to for the request */
-       if (locking && (pb->pb_flags & PBF_WRITE) && (pb->pb_locked == 0)) {
-               for (i = 0; size; i++) {
-                       int             nbytes = PAGE_CACHE_SIZE - offset;
-                       struct page     *page = pb->pb_pages[i];
-
-                       if (nbytes > size)
-                               nbytes = size;
-
-                       lock_page(page);
-
-                       size -= nbytes;
-                       offset = 0;
-               }
-               offset = pb->pb_offset;
-               size = pb->pb_count_desired;
-       }
-
-next_chunk:
-       atomic_inc(&pb->pb_io_remaining);
-       nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
-       if (nr_pages > total_nr_pages)
-               nr_pages = total_nr_pages;
-
-       bio = bio_alloc(GFP_NOIO, nr_pages);
-       bio->bi_bdev = pb->pb_target->pbr_bdev;
-       bio->bi_sector = sector;
-       bio->bi_end_io = bio_end_io_pagebuf;
-       bio->bi_private = pb;
-
-       for (; size && nr_pages; nr_pages--, map_i++) {
-               int     nbytes = PAGE_CACHE_SIZE - offset;
-
-               if (nbytes > size)
-                       nbytes = size;
-
-               if (bio_add_page(bio, pb->pb_pages[map_i],
-                                       nbytes, offset) < nbytes)
-                       break;
-
-               offset = 0;
-               sector += nbytes >> BBSHIFT;
-               size -= nbytes;
-               total_nr_pages--;
-       }
-
-submit_io:
-       if (likely(bio->bi_size)) {
-               submit_bio((pb->pb_flags & PBF_READ) ? READ : WRITE, bio);
-               if (size)
-                       goto next_chunk;
-       } else {
-               bio_put(bio);
-               pagebuf_ioerror(pb, EIO);
-       }
-
-       if (pb->pb_flags & PBF_RUN_QUEUES) {
-               pb->pb_flags &= ~PBF_RUN_QUEUES;
-               if (atomic_read(&pb->pb_io_remaining) > 1)
-                       blk_run_queues();
-       }
-}
-
-/*
- *     pagebuf_iorequest
- *
- *     pagebuf_iorequest is the core I/O request routine.
- *     It assumes that the buffer is well-formed and
- *     mapped and ready for physical I/O, unlike
- *     pagebuf_iostart() and pagebuf_iophysio().  Those
- *     routines call the pagebuf_ioinitiate routine to start I/O,
- *     if it is present, or else call pagebuf_iorequest()
- *     directly if the pagebuf_ioinitiate routine is not present.
- *
- *     This function will be responsible for ensuring access to the
- *     pages is restricted whilst I/O is in progress - for locking
- *     pagebufs the pagebuf lock is the mediator, for non-locking
- *     pagebufs the pages will be locked. In the locking case we
- *     need to use the pagebuf lock as multiple meta-data buffers
- *     will reference the same page.
- */
-int
-pagebuf_iorequest(                     /* start real I/O               */
-       page_buf_t              *pb)    /* buffer to convey to device   */
-{
-       PB_TRACE(pb, "iorequest", 0);
-
-       if (pb->pb_flags & PBF_DELWRI) {
-               pagebuf_delwri_queue(pb, 1);
-               return 0;
-       }
-
-       if (pb->pb_flags & PBF_WRITE) {
-               _pagebuf_wait_unpin(pb);
-       }
-
-       pagebuf_hold(pb);
-
-       /* Set the count to 1 initially, this will stop an I/O
-        * completion callout which happens before we have started
-        * all the I/O from calling pagebuf_iodone too early.
-        */
-       atomic_set(&pb->pb_io_remaining, 1);
-       _pagebuf_ioapply(pb);
-       _pagebuf_iodone(pb, 0);
-
-       pagebuf_rele(pb);
-       return 0;
-}
-
-/*
- *     pagebuf_iowait
- *
- *     pagebuf_iowait waits for I/O to complete on the buffer supplied.
- *     It returns immediately if no I/O is pending.  In any case, it returns
- *     the error code, if any, or 0 if there is no error.
- */
-int
-pagebuf_iowait(
-       page_buf_t              *pb)
-{
-       PB_TRACE(pb, "iowait", 0);
-       if (atomic_read(&pb->pb_io_remaining))
-               blk_run_queues();
-       down(&pb->pb_iodonesema);
-       PB_TRACE(pb, "iowaited", (long)pb->pb_error);
-       return pb->pb_error;
-}
-
-STATIC void *
-pagebuf_mapout_locked(
-       page_buf_t              *pb)
-{
-       void                    *old_addr = NULL;
-
-       if (pb->pb_flags & PBF_MAPPED) {
-               if (pb->pb_flags & _PBF_ADDR_ALLOCATED)
-                       old_addr = pb->pb_addr - pb->pb_offset;
-               pb->pb_addr = NULL;
-               pb->pb_flags &= ~(PBF_MAPPED | _PBF_ADDR_ALLOCATED);
-       }
-
-       return old_addr;        /* Caller must free the address space,
-                                * we are under a spin lock, probably
-                                * not safe to do vfree here
-                                */
-}
-
-caddr_t
-pagebuf_offset(
-       page_buf_t              *pb,
-       size_t                  offset)
-{
-       struct page             *page;
-
-       offset += pb->pb_offset;
-
-       page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT];
-       return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1));
-}
-
-/*
- *     pagebuf_iomove
- *
- *     Move data into or out of a buffer.
- */
-void
-pagebuf_iomove(
-       page_buf_t              *pb,    /* buffer to process            */
-       size_t                  boff,   /* starting buffer offset       */
-       size_t                  bsize,  /* length to copy               */
-       caddr_t                 data,   /* data address                 */
-       page_buf_rw_t           mode)   /* read/write flag              */
-{
-       size_t                  bend, cpoff, csize;
-       struct page             *page;
-
-       bend = boff + bsize;
-       while (boff < bend) {
-               page = pb->pb_pages[page_buf_btoct(boff + pb->pb_offset)];
-               cpoff = page_buf_poff(boff + pb->pb_offset);
-               csize = min_t(size_t,
-                             PAGE_CACHE_SIZE-cpoff, pb->pb_count_desired-boff);
-
-               ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE));
-
-               switch (mode) {
-               case PBRW_ZERO:
-                       memset(page_address(page) + cpoff, 0, csize);
-                       break;
-               case PBRW_READ:
-                       memcpy(data, page_address(page) + cpoff, csize);
-                       break;
-               case PBRW_WRITE:
-                       memcpy(page_address(page) + cpoff, data, csize);
-               }
-
-               boff += csize;
-               data += csize;
-       }
-}
-
-
-/*
- * Pagebuf delayed write buffer handling
- */
-
-STATIC int pbd_active = 1;
-STATIC LIST_HEAD(pbd_delwrite_queue);
-STATIC spinlock_t pbd_delwrite_lock = SPIN_LOCK_UNLOCKED;
-
-STATIC void
-pagebuf_delwri_queue(
-       page_buf_t              *pb,
-       int                     unlock)
-{
-       PB_TRACE(pb, "delwri_q", (long)unlock);
-       spin_lock(&pbd_delwrite_lock);
-       /* If already in the queue, dequeue and place at tail */
-       if (!list_empty(&pb->pb_list)) {
-               if (unlock) {
-                       atomic_dec(&pb->pb_hold);
-               }
-               list_del(&pb->pb_list);
-       }
-
-       list_add_tail(&pb->pb_list, &pbd_delwrite_queue);
-       pb->pb_flushtime = jiffies + pb_params.age_buffer.val;
-       spin_unlock(&pbd_delwrite_lock);
-
-       if (unlock && (pb->pb_flags & _PBF_LOCKABLE)) {
-               pagebuf_unlock(pb);
-       }
-}
-
-void
-pagebuf_delwri_dequeue(
-       page_buf_t              *pb)
-{
-       PB_TRACE(pb, "delwri_uq", 0);
-       spin_lock(&pbd_delwrite_lock);
-       list_del_init(&pb->pb_list);
-       pb->pb_flags &= ~PBF_DELWRI;
-       spin_unlock(&pbd_delwrite_lock);
-}
-
-STATIC void
-pagebuf_runall_queues(
-       struct workqueue_struct *queue)
-{
-       flush_workqueue(queue);
-}
-
-/* Defines for pagebuf daemon */
-DECLARE_WAIT_QUEUE_HEAD(pbd_waitq);
-STATIC int force_flush;
-
-STATIC void
-pagebuf_daemon_wakeup(
-       int                     flag)
-{
-       force_flush = flag;
-       if (waitqueue_active(&pbd_waitq)) {
-               wake_up_interruptible(&pbd_waitq);
-       }
-}
-
-typedef void (*timeout_fn)(unsigned long);
-
-STATIC int
-pagebuf_daemon(
-       void                    *data)
-{
-       int                     count;
-       page_buf_t              *pb;
-       struct list_head        *curr, *next, tmp;
-       struct timer_list       pb_daemon_timer =
-               TIMER_INITIALIZER((timeout_fn)pagebuf_daemon_wakeup, 0, 0);
-
-       /*  Set up the thread  */
-       daemonize("pagebufd");
-
-       current->flags |= PF_MEMALLOC;
-
-       INIT_LIST_HEAD(&tmp);
-       do {
-               /* swsusp */
-               if (current->flags & PF_FREEZE)
-                       refrigerator(PF_IOTHREAD);
-
-               if (pbd_active == 1) {
-                       mod_timer(&pb_daemon_timer,
-                                 jiffies + pb_params.flush_interval.val);
-                       interruptible_sleep_on(&pbd_waitq);
-               }
-
-               if (pbd_active == 0) {
-                       del_timer_sync(&pb_daemon_timer);
-               }
-
-               spin_lock(&pbd_delwrite_lock);
-
-               count = 0;
-               list_for_each_safe(curr, next, &pbd_delwrite_queue) {
-                       pb = list_entry(curr, page_buf_t, pb_list);
-
-                       PB_TRACE(pb, "walkq1", (long)pagebuf_ispin(pb));
-
-                       if ((pb->pb_flags & PBF_DELWRI) && !pagebuf_ispin(pb) &&
-                           (((pb->pb_flags & _PBF_LOCKABLE) == 0) ||
-                            !pagebuf_cond_lock(pb))) {
-
-                               if (!force_flush &&
-                                   time_before(jiffies, pb->pb_flushtime)) {
-                                       pagebuf_unlock(pb);
-                                       break;
-                               }
-
-                               pb->pb_flags &= ~PBF_DELWRI;
-                               pb->pb_flags |= PBF_WRITE;
-
-                               list_del(&pb->pb_list);
-                               list_add(&pb->pb_list, &tmp);
-
-                               count++;
-                       }
-               }
-
-               spin_unlock(&pbd_delwrite_lock);
-               while (!list_empty(&tmp)) {
-                       pb = list_entry(tmp.next, page_buf_t, pb_list);
-                       list_del_init(&pb->pb_list);
-
-                       pagebuf_iostrategy(pb);
-               }
-
-               if (as_list_len > 0)
-                       purge_addresses();
-               if (count)
-                       blk_run_queues();
-
-               force_flush = 0;
-       } while (pbd_active == 1);
-
-       pbd_active = -1;
-       wake_up_interruptible(&pbd_waitq);
-
-       return 0;
-}
-
-void
-pagebuf_delwri_flush(
-       pb_target_t             *target,
-       u_long                  flags,
-       int                     *pinptr)
-{
-       page_buf_t              *pb;
-       struct list_head        *curr, *next, tmp;
-       int                     pincount = 0;
-       int                     flush_cnt = 0;
-
-       pagebuf_runall_queues(pagebuf_dataio_workqueue);
-       pagebuf_runall_queues(pagebuf_logio_workqueue);
-
-       spin_lock(&pbd_delwrite_lock);
-       INIT_LIST_HEAD(&tmp);
-
-       list_for_each_safe(curr, next, &pbd_delwrite_queue) {
-               pb = list_entry(curr, page_buf_t, pb_list);
-
-               /*
-                * Skip other targets, markers and in progress buffers
-                */
-
-               if ((pb->pb_flags == 0) || (pb->pb_target != target) ||
-                   !(pb->pb_flags & PBF_DELWRI)) {
-                       continue;
-               }
-
-               PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb));
-               if (pagebuf_ispin(pb)) {
-                       pincount++;
-                       continue;
-               }
-
-               pb->pb_flags &= ~PBF_DELWRI;
-               pb->pb_flags |= PBF_WRITE;
-               list_move(&pb->pb_list, &tmp);
-       }
-       /* ok found all the items that can be worked on 
-        * drop the lock and process the private list */
-       spin_unlock(&pbd_delwrite_lock);
-
-       list_for_each_safe(curr, next, &tmp) {
-               pb = list_entry(curr, page_buf_t, pb_list);
-
-               if (flags & PBDF_WAIT)
-                       pb->pb_flags &= ~PBF_ASYNC;
-               else
-                       list_del_init(curr);
-
-               pagebuf_lock(pb);
-               pagebuf_iostrategy(pb);
-               if (++flush_cnt > 32) {
-                       blk_run_queues();
-                       flush_cnt = 0;
-               }
-       }
-
-       blk_run_queues();
-
-       while (!list_empty(&tmp)) {
-               pb = list_entry(tmp.next, page_buf_t, pb_list);
-
-               list_del_init(&pb->pb_list);
-               pagebuf_iowait(pb);
-               if (!pb->pb_relse)
-                       pagebuf_unlock(pb);
-               pagebuf_rele(pb);
-       }
-
-       if (pinptr)
-               *pinptr = pincount;
-}
-
-STATIC int
-pagebuf_daemon_start(void)
-{
-       int             rval;
-
-       pagebuf_logio_workqueue = create_workqueue("xfslogd");
-       if (!pagebuf_logio_workqueue)
-               return -ENOMEM;
-
-       pagebuf_dataio_workqueue = create_workqueue("xfsdatad");
-       if (!pagebuf_dataio_workqueue) {
-               destroy_workqueue(pagebuf_logio_workqueue);
-               return -ENOMEM;
-       }
-
-       rval = kernel_thread(pagebuf_daemon, NULL, CLONE_FS|CLONE_FILES);
-       if (rval < 0) {
-               destroy_workqueue(pagebuf_logio_workqueue);
-               destroy_workqueue(pagebuf_dataio_workqueue);
-       }
-
-       return rval;
-}
-
-/*
- * pagebuf_daemon_stop
- *
- * Note: do not mark as __exit, it is called from pagebuf_terminate.
- */
-STATIC void
-pagebuf_daemon_stop(void)
-{
-       pbd_active = 0;
-       wake_up_interruptible(&pbd_waitq);
-       wait_event_interruptible(pbd_waitq, pbd_active);
-       destroy_workqueue(pagebuf_logio_workqueue);
-       destroy_workqueue(pagebuf_dataio_workqueue);
-}
-
-
-/*
- * Pagebuf sysctl interface
- */
-
-STATIC int
-pb_stats_clear_handler(
-       ctl_table               *ctl,
-       int                     write,
-       struct file             *filp,
-       void                    *buffer,
-       size_t                  *lenp)
-{
-       int                     c, ret;
-       int                     *valp = ctl->data;
-
-       ret = proc_dointvec_minmax(ctl, write, filp, buffer, lenp);
-
-       if (!ret && write && *valp) {
-               printk("XFS Clearing pbstats\n");
-               for (c = 0; c < NR_CPUS; c++) {
-                       if (!cpu_possible(c)) continue;
-                               memset(&per_cpu(pbstats, c), 0,
-                                      sizeof(struct pbstats));
-               }
-               pb_params.stats_clear.val = 0;
-       }
-
-       return ret;
-}
-
-STATIC struct ctl_table_header *pagebuf_table_header;
-
-STATIC ctl_table pagebuf_table[] = {
-       {PB_FLUSH_INT, "flush_int", &pb_params.flush_interval.val,
-       sizeof(int), 0644, NULL, &proc_dointvec_minmax,
-       &sysctl_intvec, NULL,
-       &pb_params.flush_interval.min, &pb_params.flush_interval.max},
-
-       {PB_FLUSH_AGE, "flush_age", &pb_params.age_buffer.val,
-       sizeof(int), 0644, NULL, &proc_dointvec_minmax,
-       &sysctl_intvec, NULL, 
-       &pb_params.age_buffer.min, &pb_params.age_buffer.max},
-
-       {PB_STATS_CLEAR, "stats_clear", &pb_params.stats_clear.val,
-       sizeof(int), 0644, NULL, &pb_stats_clear_handler,
-       &sysctl_intvec, NULL, 
-       &pb_params.stats_clear.min, &pb_params.stats_clear.max},
-
-#ifdef PAGEBUF_TRACE
-       {PB_DEBUG, "debug", &pb_params.debug.val,
-       sizeof(int), 0644, NULL, &proc_dointvec_minmax,
-       &sysctl_intvec, NULL, 
-       &pb_params.debug.min, &pb_params.debug.max},
-#endif
-       {0}
-};
-
-STATIC ctl_table pagebuf_dir_table[] = {
-       {VM_PAGEBUF, "pagebuf", NULL, 0, 0555, pagebuf_table},
-       {0}
-};
-
-STATIC ctl_table pagebuf_root_table[] = {
-       {CTL_VM, "vm",  NULL, 0, 0555, pagebuf_dir_table},
-       {0}
-};
-
-#ifdef CONFIG_PROC_FS
-STATIC int
-pagebuf_readstats(
-       char                    *buffer,
-       char                    **start,
-       off_t                   offset,
-       int                     count,
-       int                     *eof,
-       void                    *data)
-{
-       int                     c, i, len, val;
-
-       len = 0;
-       len += sprintf(buffer + len, "pagebuf");
-       for (i = 0; i < sizeof(struct pbstats) / sizeof(u_int32_t); i++) {
-               val = 0;
-               for (c = 0 ; c < NR_CPUS; c++) {
-                       if (!cpu_possible(c)) continue;
-                       val += *(((u_int32_t*)&per_cpu(pbstats, c) + i));
-               }
-               len += sprintf(buffer + len, " %u", val);
-       }
-       buffer[len++] = '\n';
-
-       if (offset >= len) {
-               *start = buffer;
-               *eof = 1;
-               return 0;
-       }
-       *start = buffer + offset;
-       if ((len -= offset) > count)
-               return count;
-       *eof = 1;
-
-       return len;
-}
-#endif  /* CONFIG_PROC_FS */
-
-/*
- *     Initialization and Termination
- */
-
-int __init
-pagebuf_init(void)
-{
-       int                     i;
-
-       pagebuf_table_header = register_sysctl_table(pagebuf_root_table, 1);
-
-#ifdef CONFIG_PROC_FS
-       if (proc_mkdir("fs/pagebuf", 0))
-               create_proc_read_entry(
-                       "fs/pagebuf/stat", 0, 0, pagebuf_readstats, NULL);
-#endif
-
-       pagebuf_cache = kmem_cache_create("page_buf_t", sizeof(page_buf_t), 0,
-                       SLAB_HWCACHE_ALIGN, NULL, NULL);
-       if (pagebuf_cache == NULL) {
-               printk("pagebuf: couldn't init pagebuf cache\n");
-               pagebuf_terminate();
-               return -ENOMEM;
-       }
-
-       for (i = 0; i < NHASH; i++) {
-               spin_lock_init(&pbhash[i].pb_hash_lock);
-               INIT_LIST_HEAD(&pbhash[i].pb_hash);
-       }
-
-#ifdef PAGEBUF_TRACE
-       pagebuf_trace_buf = ktrace_alloc(PAGEBUF_TRACE_SIZE, KM_SLEEP);
-#endif
-
-       pagebuf_daemon_start();
-       return 0;
-}
-
-
-/*
- *     pagebuf_terminate.
- *
- *     Note: do not mark as __exit, this is also called from the __init code.
- */
-void
-pagebuf_terminate(void)
-{
-       pagebuf_daemon_stop();
-
-       kmem_cache_destroy(pagebuf_cache);
-
-       unregister_sysctl_table(pagebuf_table_header);
-#ifdef  CONFIG_PROC_FS
-       remove_proc_entry("fs/pagebuf/stat", NULL);
-       remove_proc_entry("fs/pagebuf", NULL);
-#endif
-}
-
-
-/*
- *     Module management (for kernel debugger module)
- */
-EXPORT_SYMBOL(pagebuf_offset);
-#ifdef DEBUG
-EXPORT_SYMBOL(pbd_delwrite_queue);
-#endif
diff --git a/fs/xfs/pagebuf/page_buf.h b/fs/xfs/pagebuf/page_buf.h
deleted file mode 100644 (file)
index ef7895f..0000000
+++ /dev/null
@@ -1,340 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like.  Any license provided herein, whether implied or
- * otherwise, applies only to this software file.  Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA  94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-
-/*
- * Written by Steve Lord, Jim Mostek, Russell Cattelan at SGI
- */
-
-#ifndef __PAGE_BUF_H__
-#define __PAGE_BUF_H__
-
-#include <linux/config.h>
-#include <linux/list.h>
-#include <linux/types.h>
-#include <linux/spinlock.h>
-#include <asm/system.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/buffer_head.h>
-#include <linux/uio.h>
-
-/*
- *     Base types
- */
-
-/* daddr must be signed since -1 is used for bmaps that are not yet allocated */
-typedef loff_t page_buf_daddr_t;
-
-#define PAGE_BUF_DADDR_NULL ((page_buf_daddr_t) (-1LL))
-
-#define page_buf_ctob(pp)      ((pp) * PAGE_CACHE_SIZE)
-#define page_buf_btoc(dd)      (((dd) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT)
-#define page_buf_btoct(dd)     ((dd) >> PAGE_CACHE_SHIFT)
-#define page_buf_poff(aa)      ((aa) & ~PAGE_CACHE_MASK)
-
-typedef enum page_buf_rw_e {
-       PBRW_READ = 1,                  /* transfer into target memory */
-       PBRW_WRITE = 2,                 /* transfer from target memory */
-       PBRW_ZERO = 3                   /* Zero target memory */
-} page_buf_rw_t;
-
-
-typedef enum page_buf_flags_e {                /* pb_flags values */
-       PBF_READ = (1 << 0),    /* buffer intended for reading from device */
-       PBF_WRITE = (1 << 1),   /* buffer intended for writing to device   */
-       PBF_MAPPED = (1 << 2),  /* buffer mapped (pb_addr valid)           */
-       PBF_PARTIAL = (1 << 3), /* buffer partially read                   */
-       PBF_ASYNC = (1 << 4),   /* initiator will not wait for completion  */
-       PBF_NONE = (1 << 5),    /* buffer not read at all                  */
-       PBF_DELWRI = (1 << 6),  /* buffer has dirty pages                  */
-       PBF_FREED = (1 << 7),   /* buffer has been freed and is invalid    */
-       PBF_SYNC = (1 << 8),    /* force updates to disk                   */
-       PBF_MAPPABLE = (1 << 9),/* use directly-addressable pages          */
-       PBF_STALE = (1 << 10),  /* buffer has been staled, do not find it  */
-       PBF_FS_MANAGED = (1 << 11), /* filesystem controls freeing memory  */
-       PBF_FS_DATAIOD = (1 << 12), /* schedule IO completion on fs datad  */
-
-       /* flags used only as arguments to access routines */
-       PBF_LOCK = (1 << 13),   /* lock requested                          */
-       PBF_TRYLOCK = (1 << 14), /* lock requested, but do not wait        */
-       PBF_DONT_BLOCK = (1 << 15), /* do not block in current thread      */
-
-       /* flags used only internally */
-       _PBF_LOCKABLE = (1 << 16), /* page_buf_t may be locked             */
-       _PBF_PRIVATE_BH = (1 << 17), /* do not use public buffer heads     */
-       _PBF_ALL_PAGES_MAPPED = (1 << 18), /* all pages in range mapped    */
-       _PBF_ADDR_ALLOCATED = (1 << 19), /* pb_addr space was allocated    */
-       _PBF_MEM_ALLOCATED = (1 << 20), /* underlying pages are allocated  */
-       _PBF_MEM_SLAB = (1 << 21), /* underlying pages are slab allocated  */
-
-       PBF_FORCEIO = (1 << 22), /* ignore any cache state                 */
-       PBF_FLUSH = (1 << 23),  /* flush disk write cache                  */
-       PBF_READ_AHEAD = (1 << 24), /* asynchronous read-ahead             */
-       PBF_RUN_QUEUES = (1 << 25), /* run block device task queue         */
-
-} page_buf_flags_t;
-
-#define PBF_UPDATE (PBF_READ | PBF_WRITE)
-#define PBF_NOT_DONE(pb) (((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) != 0)
-#define PBF_DONE(pb) (((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) == 0)
-
-typedef struct pb_target {
-       dev_t                   pbr_dev;
-       struct block_device     *pbr_bdev;
-       struct address_space    *pbr_mapping;
-       unsigned int            pbr_bsize;
-       unsigned int            pbr_sshift;
-       size_t                  pbr_smask;
-} pb_target_t;
-
-/*
- *     page_buf_t:  Buffer structure for page cache-based buffers
- *
- * This buffer structure is used by the page cache buffer management routines
- * to refer to an assembly of pages forming a logical buffer.  The actual
- * I/O is performed with buffer_head or bio structures, as required by drivers,
- * for drivers which do not understand this structure.  The buffer structure is
- * used on temporary basis only, and discarded when released.
- *
- * The real data storage is recorded in the page cache.  Metadata is
- * hashed to the inode for the block device on which the file system resides.
- * File data is hashed to the inode for the file.  Pages which are only
- * partially filled with data have bits set in their block_map entry
- * to indicate which disk blocks in the page are not valid.
- */
-
-struct page_buf_s;
-typedef void (*page_buf_iodone_t)(struct page_buf_s *);
-                       /* call-back function on I/O completion */
-typedef void (*page_buf_relse_t)(struct page_buf_s *);
-                       /* call-back function on I/O completion */
-typedef int (*page_buf_bdstrat_t)(struct page_buf_s *);
-
-#define PB_PAGES       4
-
-typedef struct page_buf_s {
-       struct semaphore        pb_sema;        /* semaphore for lockables  */
-       unsigned long           pb_flushtime;   /* time to flush pagebuf    */
-       atomic_t                pb_pin_count;   /* pin count                */
-       wait_queue_head_t       pb_waiters;     /* unpin waiters            */
-       struct list_head        pb_list;
-       page_buf_flags_t        pb_flags;       /* status flags */
-       struct list_head        pb_hash_list;
-       struct pb_target        *pb_target;     /* logical object */
-       atomic_t                pb_hold;        /* reference count */
-       page_buf_daddr_t        pb_bn;          /* block number for I/O */
-       loff_t                  pb_file_offset; /* offset in file */
-       size_t                  pb_buffer_length; /* size of buffer in bytes */
-       size_t                  pb_count_desired; /* desired transfer size */
-       void                    *pb_addr;       /* virtual address of buffer */
-       struct work_struct      pb_iodone_work;
-       atomic_t                pb_io_remaining;/* #outstanding I/O requests */
-       page_buf_iodone_t       pb_iodone;      /* I/O completion function */
-       page_buf_relse_t        pb_relse;       /* releasing function */
-       page_buf_bdstrat_t      pb_strat;       /* pre-write function */
-       struct semaphore        pb_iodonesema;  /* Semaphore for I/O waiters */
-       void                    *pb_fspriv;
-       void                    *pb_fspriv2;
-       void                    *pb_fspriv3;
-       unsigned short          pb_error;       /* error code on I/O */
-       unsigned short          pb_page_count;  /* size of page array */
-       unsigned short          pb_offset;      /* page offset in first page */
-       unsigned char           pb_locked;      /* page array is locked */
-       unsigned char           pb_hash_index;  /* hash table index     */
-       struct page             **pb_pages;     /* array of page pointers */
-       struct page             *pb_page_array[PB_PAGES]; /* inline pages */
-#ifdef PAGEBUF_LOCK_TRACKING
-       int                     pb_last_holder;
-#endif
-} page_buf_t;
-
-
-/* Finding and Reading Buffers */
-
-extern page_buf_t *pagebuf_find(       /* find buffer for block if     */
-                                       /* the block is in memory       */
-               struct pb_target *,     /* inode for block              */
-               loff_t,                 /* starting offset of range     */
-               size_t,                 /* length of range              */
-               page_buf_flags_t);      /* PBF_LOCK                     */
-
-extern page_buf_t *pagebuf_get(                /* allocate a buffer            */
-               struct pb_target *,     /* inode for buffer             */
-               loff_t,                 /* starting offset of range     */
-               size_t,                 /* length of range              */
-               page_buf_flags_t);      /* PBF_LOCK, PBF_READ,          */
-                                       /* PBF_ASYNC                    */
-
-extern page_buf_t *pagebuf_lookup(
-               struct pb_target *,
-               loff_t,                 /* starting offset of range     */
-               size_t,                 /* length of range              */
-               page_buf_flags_t);      /* PBF_READ, PBF_WRITE,         */
-                                       /* PBF_FORCEIO, _PBF_LOCKABLE   */
-
-extern page_buf_t *pagebuf_get_empty(  /* allocate pagebuf struct with */
-                                       /*  no memory or disk address   */
-               size_t len,
-               struct pb_target *);    /* mount point "fake" inode     */
-
-extern page_buf_t *pagebuf_get_no_daddr(/* allocate pagebuf struct     */
-                                       /* without disk address         */
-               size_t len,
-               struct pb_target *);    /* mount point "fake" inode     */
-
-extern int pagebuf_associate_memory(
-               page_buf_t *,
-               void *,
-               size_t);
-
-extern void pagebuf_hold(              /* increment reference count    */
-               page_buf_t *);          /* buffer to hold               */
-
-extern void pagebuf_readahead(         /* read ahead into cache        */
-               struct pb_target  *,    /* target for buffer (or NULL)  */
-               loff_t,                 /* starting offset of range     */
-               size_t,                 /* length of range              */
-               page_buf_flags_t);      /* additional read flags        */
-
-/* Releasing Buffers */
-
-extern void pagebuf_free(              /* deallocate a buffer          */
-               page_buf_t *);          /* buffer to deallocate         */
-
-extern void pagebuf_rele(              /* release hold on a buffer     */
-               page_buf_t *);          /* buffer to release            */
-
-/* Locking and Unlocking Buffers */
-
-extern int pagebuf_cond_lock(          /* lock buffer, if not locked   */
-                                       /* (returns -EBUSY if locked)   */
-               page_buf_t *);          /* buffer to lock               */
-
-extern int pagebuf_lock_value(         /* return count on lock         */
-               page_buf_t *);          /* buffer to check              */
-
-extern int pagebuf_lock(               /* lock buffer                  */
-               page_buf_t *);          /* buffer to lock               */
-
-extern void pagebuf_unlock(            /* unlock buffer                */
-               page_buf_t *);          /* buffer to unlock             */
-
-/* Buffer Read and Write Routines */
-
-extern void pagebuf_iodone(            /* mark buffer I/O complete     */
-               page_buf_t *,           /* buffer to mark               */
-               int,                    /* use data/log helper thread.  */
-               int);                   /* run completion locally, or in
-                                        * a helper thread.             */
-
-extern void pagebuf_ioerror(           /* mark buffer in error (or not) */
-               page_buf_t *,           /* buffer to mark               */
-               unsigned int);          /* error to store (0 if none)   */
-
-extern int pagebuf_iostart(            /* start I/O on a buffer        */
-               page_buf_t *,           /* buffer to start              */
-               page_buf_flags_t);      /* PBF_LOCK, PBF_ASYNC,         */
-                                       /* PBF_READ, PBF_WRITE,         */
-                                       /* PBF_DELWRI, PBF_SYNC         */
-
-extern int pagebuf_iorequest(          /* start real I/O               */
-               page_buf_t *);          /* buffer to convey to device   */
-
-extern int pagebuf_iowait(             /* wait for buffer I/O done     */
-               page_buf_t *);          /* buffer to wait on            */
-
-extern void pagebuf_iomove(            /* move data in/out of pagebuf  */
-               page_buf_t *,           /* buffer to manipulate         */
-               size_t,                 /* starting buffer offset       */
-               size_t,                 /* length in buffer             */
-               caddr_t,                /* data pointer                 */
-               page_buf_rw_t);         /* direction                    */
-
-static inline int pagebuf_iostrategy(page_buf_t *pb)
-{
-       return pb->pb_strat ? pb->pb_strat(pb) : pagebuf_iorequest(pb);
-}
-
-static inline int pagebuf_geterror(page_buf_t *pb)
-{
-       return pb ? pb->pb_error : ENOMEM;
-}
-
-/* Buffer Utility Routines */
-
-extern caddr_t pagebuf_offset(         /* pointer at offset in buffer  */
-               page_buf_t *,           /* buffer to offset into        */
-               size_t);                /* offset                       */
-
-/* Pinning Buffer Storage in Memory */
-
-extern void pagebuf_pin(               /* pin buffer in memory         */
-               page_buf_t *);          /* buffer to pin                */
-
-extern void pagebuf_unpin(             /* unpin buffered data          */
-               page_buf_t *);          /* buffer to unpin              */
-
-extern int pagebuf_ispin(              /* check if buffer is pinned    */
-               page_buf_t *);          /* buffer to check              */
-
-/* Delayed Write Buffer Routines */
-
-#define PBDF_WAIT    0x01
-extern void pagebuf_delwri_flush(
-               pb_target_t *,
-               unsigned long,
-               int *);
-
-extern void pagebuf_delwri_dequeue(
-               page_buf_t *);
-
-/* Buffer Daemon Setup Routines */
-
-extern int pagebuf_init(void);
-extern void pagebuf_terminate(void);
-
-
-#ifdef PAGEBUF_TRACE
-extern ktrace_t *pagebuf_trace_buf;
-extern void pagebuf_trace(
-               page_buf_t *,           /* buffer being traced          */
-               char *,                 /* description of operation     */
-               void *,                 /* arbitrary diagnostic value   */
-               void *);                /* return address               */
-#else
-# define pagebuf_trace(pb, id, ptr, ra)        do { } while (0)
-#endif
-
-#define pagebuf_target_name(target)    \
-       ({ char __b[BDEVNAME_SIZE]; bdevname((target)->pbr_bdev, __b); __b; })
-
-#endif /* __PAGE_BUF_H__ */
diff --git a/fs/xfs/support/kmem.h b/fs/xfs/support/kmem.h
deleted file mode 100644 (file)
index a8fb09f..0000000
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like.  Any license provided herein, whether implied or
- * otherwise, applies only to this software file.  Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA  94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-#ifndef __XFS_SUPPORT_KMEM_H__
-#define __XFS_SUPPORT_KMEM_H__
-
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-
-/*
- * Cutoff point to use vmalloc instead of kmalloc.
- */
-#define MAX_SLAB_SIZE  0x10000
-
-/*
- * XFS uses slightly different names for these due to the
- * IRIX heritage.
- */
-#define        kmem_zone       kmem_cache_s
-#define kmem_zone_t    kmem_cache_t
-
-#define KM_SLEEP       0x0001
-#define KM_NOSLEEP     0x0002
-#define KM_NOFS                0x0004
-
-typedef unsigned long xfs_pflags_t;
-
-#define PFLAGS_TEST_FSTRANS()          (current->flags & PF_FSTRANS)
-
-#define PFLAGS_SET_FSTRANS(STATEP) do {        \
-       *(STATEP) = current->flags;     \
-       current->flags |= PF_FSTRANS;   \
-} while (0)
-
-#define PFLAGS_RESTORE(STATEP) do {    \
-       current->flags = *(STATEP);     \
-} while (0)
-
-#define PFLAGS_DUP(OSTATEP, NSTATEP) do { \
-       *(NSTATEP) = *(OSTATEP);        \
-} while (0)
-
-/*
- * XXX get rid of the unconditional  __GFP_NOFAIL by adding
- * a KM_FAIL flag and using it where we're allowed to fail.
- */
-static __inline unsigned int
-kmem_flags_convert(int flags)
-{
-       int lflags;
-
-#if DEBUG
-       if (unlikely(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS))) {
-               printk(KERN_WARNING
-                   "XFS: memory allocation with wrong flags (%x)\n", flags);
-               BUG();
-       }
-#endif
-
-       lflags = (flags & KM_NOSLEEP) ? GFP_ATOMIC : (GFP_KERNEL|__GFP_NOFAIL);
-
-       /* avoid recusive callbacks to filesystem during transactions */
-       if (PFLAGS_TEST_FSTRANS())
-               lflags &= ~__GFP_FS;
-
-       return lflags;
-}
-
-static __inline void *
-kmem_alloc(size_t size, int flags)
-{
-       if (unlikely(MAX_SLAB_SIZE < size))
-               /* Avoid doing filesystem sensitive stuff to get this */
-               return __vmalloc(size, kmem_flags_convert(flags), PAGE_KERNEL);
-       return kmalloc(size, kmem_flags_convert(flags));
-}
-
-static __inline void *
-kmem_zalloc(size_t size, int flags)
-{
-       void *ptr = kmem_alloc(size, flags);
-       if (likely(ptr != NULL))
-               memset(ptr, 0, size);
-       return ptr;
-}
-
-static __inline void
-kmem_free(void *ptr, size_t size)
-{
-       if (unlikely((unsigned long)ptr < VMALLOC_START ||
-                    (unsigned long)ptr >= VMALLOC_END))
-               kfree(ptr);
-       else
-               vfree(ptr);
-}
-
-static __inline void *
-kmem_realloc(void *ptr, size_t newsize, size_t oldsize, int flags)
-{
-       void *new = kmem_alloc(newsize, flags);
-
-       if (likely(ptr != NULL)) {
-               if (likely(new != NULL))
-                       memcpy(new, ptr, min(oldsize, newsize));
-               kmem_free(ptr, oldsize);
-       }
-
-       return new;
-}
-
-static __inline kmem_zone_t *
-kmem_zone_init(int size, char *zone_name)
-{
-       return kmem_cache_create(zone_name, size, 0, 0, NULL, NULL);
-}
-
-static __inline void *
-kmem_zone_alloc(kmem_zone_t *zone, int flags)
-{
-       return kmem_cache_alloc(zone, kmem_flags_convert(flags));
-}
-
-static __inline void *
-kmem_zone_zalloc(kmem_zone_t *zone, int flags)
-{
-       void *ptr = kmem_zone_alloc(zone, flags);
-       if (likely(ptr != NULL))
-               memset(ptr, 0, kmem_cache_size(zone));
-       return ptr;
-}
-
-static __inline void
-kmem_zone_free(kmem_zone_t *zone, void *ptr)
-{
-       kmem_cache_free(zone, ptr);
-}
-
-typedef struct shrinker *kmem_shaker_t;
-typedef int (*kmem_shake_func_t)(int, unsigned int);
-
-static __inline kmem_shaker_t
-kmem_shake_register(kmem_shake_func_t sfunc)
-{
-       return set_shrinker(DEFAULT_SEEKS, sfunc);
-}
-
-static __inline void
-kmem_shake_deregister(kmem_shaker_t shrinker)
-{
-       remove_shrinker(shrinker);
-}
-
-static __inline int
-kmem_shake_allow(unsigned int gfp_mask)
-{
-       return (gfp_mask & __GFP_WAIT);
-}
-
-#endif /* __XFS_SUPPORT_KMEM_H__ */
index ad4033e146143c41b3c55af09b48469c187e808c..e776812da8bcd2c73a02a368ef564634f9ef89d9 100644 (file)
@@ -35,8 +35,8 @@
 #include <linux/slab.h>
 
 #include <xfs_types.h>
-#include "kmem.h"
-#include "spin.h"
+#include <kmem.h>
+#include <spin.h>
 #include "debug.h"
 #include "ktrace.h"
 
index b566ef8fa756353ecb7c832ad685462c95a8f481..92d1a1a5d04b643d021a05d40b1107201a6eb5fa 100644 (file)
@@ -32,7 +32,7 @@
 #ifndef __XFS_SUPPORT_KTRACE_H__
 #define __XFS_SUPPORT_KTRACE_H__
 
-#include <support/spin.h>
+#include <spin.h>
 
 /*
  * Trace buffer entry structure.
diff --git a/fs/xfs/support/mrlock.c b/fs/xfs/support/mrlock.c
deleted file mode 100644 (file)
index 5b5dae9..0000000
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like.  Any license provided herein, whether implied or
- * otherwise, applies only to this software file.  Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA  94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-
-#include <linux/time.h>
-#include <linux/sched.h>
-#include <asm/system.h>
-#include <linux/interrupt.h>
-#include <asm/current.h>
-
-#include "mrlock.h"
-
-
-#if USE_RW_WAIT_QUEUE_SPINLOCK
-# define wq_write_lock write_lock
-#else
-# define wq_write_lock spin_lock
-#endif
-
-/*
- * We don't seem to need lock_type (only one supported), name, or
- * sequence. But, XFS will pass it so let's leave them here for now.
- */
-/* ARGSUSED */
-void
-mrlock_init(mrlock_t *mrp, int lock_type, char *name, long sequence)
-{
-       mrp->mr_count = 0;
-       mrp->mr_reads_waiting = 0;
-       mrp->mr_writes_waiting = 0;
-       init_waitqueue_head(&mrp->mr_readerq);
-       init_waitqueue_head(&mrp->mr_writerq);
-       mrp->mr_lock = SPIN_LOCK_UNLOCKED;
-}
-
-/*
- * Macros to lock/unlock the mrlock_t.
- */
-
-#define MRLOCK(m)              spin_lock(&(m)->mr_lock);
-#define MRUNLOCK(m)            spin_unlock(&(m)->mr_lock);
-
-
-/*
- * lock_wait should never be called in an interrupt thread.
- *
- * mrlocks can sleep (i.e. call schedule) and so they can't ever
- * be called from an interrupt thread.
- *
- * threads that wake-up should also never be invoked from interrupt threads.
- *
- * But, waitqueue_lock is locked from interrupt threads - and we are
- * called with interrupts disabled, so it is all OK.
- */
-
-/* ARGSUSED */
-void
-lock_wait(wait_queue_head_t *q, spinlock_t *lock, int rw)
-{
-       DECLARE_WAITQUEUE( wait, current );
-
-       __set_current_state(TASK_UNINTERRUPTIBLE);
-
-       spin_lock(&q->lock);
-       if (rw) {
-               __add_wait_queue_tail(q, &wait);
-       } else {
-               __add_wait_queue(q, &wait);
-       }
-
-       spin_unlock(&q->lock);
-       spin_unlock(lock);
-
-       schedule();
-
-       spin_lock(&q->lock);
-       __remove_wait_queue(q, &wait);
-       spin_unlock(&q->lock);
-
-       spin_lock(lock);
-
-       /* return with lock held */
-}
-
-/* ARGSUSED */
-void
-mrfree(mrlock_t *mrp)
-{
-}
-
-/* ARGSUSED */
-void
-mrlock(mrlock_t *mrp, int type, int flags)
-{
-       if (type == MR_ACCESS)
-               mraccess(mrp);
-       else
-               mrupdate(mrp);
-}
-
-/* ARGSUSED */
-void
-mraccessf(mrlock_t *mrp, int flags)
-{
-       MRLOCK(mrp);
-       if(mrp->mr_writes_waiting > 0) {
-               mrp->mr_reads_waiting++;
-               lock_wait(&mrp->mr_readerq, &mrp->mr_lock, 0);
-               mrp->mr_reads_waiting--;
-       }
-       while (mrp->mr_count < 0) {
-               mrp->mr_reads_waiting++;
-               lock_wait(&mrp->mr_readerq, &mrp->mr_lock, 0);
-               mrp->mr_reads_waiting--;
-       }
-       mrp->mr_count++;
-       MRUNLOCK(mrp);
-}
-
-/* ARGSUSED */
-void
-mrupdatef(mrlock_t *mrp, int flags)
-{
-       MRLOCK(mrp);
-       while(mrp->mr_count) {
-               mrp->mr_writes_waiting++;
-               lock_wait(&mrp->mr_writerq, &mrp->mr_lock, 1);
-               mrp->mr_writes_waiting--;
-       }
-
-       mrp->mr_count = -1; /* writer on it */
-       MRUNLOCK(mrp);
-}
-
-int
-mrtryaccess(mrlock_t *mrp)
-{
-       MRLOCK(mrp);
-       /*
-        * If anyone is waiting for update access or the lock is held for update
-        * fail the request.
-        */
-       if(mrp->mr_writes_waiting > 0 || mrp->mr_count < 0) {
-               MRUNLOCK(mrp);
-               return 0;
-       }
-       mrp->mr_count++;
-       MRUNLOCK(mrp);
-       return 1;
-}
-
-int
-mrtrypromote(mrlock_t *mrp)
-{
-       MRLOCK(mrp);
-
-       if(mrp->mr_count == 1) { /* We are the only thread with the lock */
-               mrp->mr_count = -1; /* writer on it */
-               MRUNLOCK(mrp);
-               return 1;
-       }
-
-       MRUNLOCK(mrp);
-       return 0;
-}
-
-int
-mrtryupdate(mrlock_t *mrp)
-{
-       MRLOCK(mrp);
-
-       if(mrp->mr_count) {
-               MRUNLOCK(mrp);
-               return 0;
-       }
-
-       mrp->mr_count = -1; /* writer on it */
-       MRUNLOCK(mrp);
-       return 1;
-}
-
-static __inline__ void mrwake(mrlock_t *mrp)
-{
-       /*
-        * First, if the count is now 0, we need to wake-up anyone waiting.
-        */
-       if (!mrp->mr_count) {
-               if (mrp->mr_writes_waiting) {   /* Wake-up first writer waiting */
-                       wake_up(&mrp->mr_writerq);
-               } else if (mrp->mr_reads_waiting) {     /* Wakeup any readers waiting */
-                       wake_up(&mrp->mr_readerq);
-               }
-       }
-}
-
-void
-mraccunlock(mrlock_t *mrp)
-{
-       MRLOCK(mrp);
-       mrp->mr_count--;
-       mrwake(mrp);
-       MRUNLOCK(mrp);
-}
-
-void
-mrunlock(mrlock_t *mrp)
-{
-       MRLOCK(mrp);
-       if (mrp->mr_count < 0) {
-               mrp->mr_count = 0;
-       } else {
-               mrp->mr_count--;
-       }
-       mrwake(mrp);
-       MRUNLOCK(mrp);
-}
-
-int
-ismrlocked(mrlock_t *mrp, int type)    /* No need to lock since info can change */
-{
-       if (type == MR_ACCESS)
-               return (mrp->mr_count > 0); /* Read lock */
-       else if (type == MR_UPDATE)
-               return (mrp->mr_count < 0); /* Write lock */
-       else if (type == (MR_UPDATE | MR_ACCESS))
-               return (mrp->mr_count); /* Any type of lock held */
-       else /* Any waiters */
-               return (mrp->mr_reads_waiting | mrp->mr_writes_waiting);
-}
-
-/*
- * Demote from update to access. We better be the only thread with the
- * lock in update mode so it should be easy to set to 1.
- * Wake-up any readers waiting.
- */
-
-void
-mrdemote(mrlock_t *mrp)
-{
-       MRLOCK(mrp);
-       mrp->mr_count = 1;
-       if (mrp->mr_reads_waiting) {    /* Wakeup all readers waiting */
-               wake_up(&mrp->mr_readerq);
-       }
-       MRUNLOCK(mrp);
-}
diff --git a/fs/xfs/support/mrlock.h b/fs/xfs/support/mrlock.h
deleted file mode 100644 (file)
index b2a7b3a..0000000
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like.  Any license provided herein, whether implied or
- * otherwise, applies only to this software file.  Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA  94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-#ifndef __XFS_SUPPORT_MRLOCK_H__
-#define __XFS_SUPPORT_MRLOCK_H__
-
-#include <linux/time.h>
-#include <linux/wait.h>
-#include <asm/atomic.h>
-#include <asm/semaphore.h>
-
-/*
- * Implement mrlocks on Linux that work for XFS.
- *
- * These are sleep locks and not spinlocks. If one wants read/write spinlocks,
- * use read_lock, write_lock, ... see spinlock.h.
- */
-
-typedef struct mrlock_s {
-       int                     mr_count;
-       unsigned short          mr_reads_waiting;
-       unsigned short          mr_writes_waiting;
-       wait_queue_head_t       mr_readerq;
-       wait_queue_head_t       mr_writerq;
-       spinlock_t              mr_lock;
-} mrlock_t;
-
-#define MR_ACCESS      1
-#define MR_UPDATE      2
-
-#define MRLOCK_BARRIER         0x1
-#define MRLOCK_ALLOW_EQUAL_PRI 0x8
-
-/*
- * mraccessf/mrupdatef take flags to be passed in while sleeping;
- * only PLTWAIT is currently supported.
- */
-
-extern void    mraccessf(mrlock_t *, int);
-extern void    mrupdatef(mrlock_t *, int);
-extern void     mrlock(mrlock_t *, int, int);
-extern void     mrunlock(mrlock_t *);
-extern void     mraccunlock(mrlock_t *);
-extern int      mrtryupdate(mrlock_t *);
-extern int      mrtryaccess(mrlock_t *);
-extern int     mrtrypromote(mrlock_t *);
-extern void     mrdemote(mrlock_t *);
-
-extern int     ismrlocked(mrlock_t *, int);
-extern void     mrlock_init(mrlock_t *, int type, char *name, long sequence);
-extern void     mrfree(mrlock_t *);
-
-#define mrinit(mrp, name)      mrlock_init(mrp, MRLOCK_BARRIER, name, -1)
-#define mraccess(mrp)          mraccessf(mrp, 0) /* grab for READ/ACCESS */
-#define mrupdate(mrp)          mrupdatef(mrp, 0) /* grab for WRITE/UPDATE */
-#define mrislocked_access(mrp) ((mrp)->mr_count > 0)
-#define mrislocked_update(mrp) ((mrp)->mr_count < 0)
-
-#endif /* __XFS_SUPPORT_MRLOCK_H__ */
diff --git a/fs/xfs/support/mutex.h b/fs/xfs/support/mutex.h
deleted file mode 100644 (file)
index 0b296bb..0000000
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like.  Any license provided herein, whether implied or
- * otherwise, applies only to this software file.  Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA  94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-#ifndef __XFS_SUPPORT_MUTEX_H__
-#define __XFS_SUPPORT_MUTEX_H__
-
-#include <linux/spinlock.h>
-#include <asm/semaphore.h>
-
-/*
- * Map the mutex'es from IRIX to Linux semaphores.
- *
- * Destroy just simply initializes to -99 which should block all other
- * callers.
- */
-#define MUTEX_DEFAULT          0x0
-typedef struct semaphore       mutex_t;
-
-#define mutex_init(lock, type, name)           sema_init(lock, 1)
-#define mutex_destroy(lock)                    sema_init(lock, -99)
-#define mutex_lock(lock, num)                  down(lock)
-#define mutex_trylock(lock)                    (down_trylock(lock) ? 0 : 1)
-#define mutex_unlock(lock)                     up(lock)
-
-#endif /* __XFS_SUPPORT_MUTEX_H__ */
diff --git a/fs/xfs/support/sema.h b/fs/xfs/support/sema.h
deleted file mode 100644 (file)
index 30b67b4..0000000
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like.  Any license provided herein, whether implied or
- * otherwise, applies only to this software file.  Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA  94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-#ifndef __XFS_SUPPORT_SEMA_H__
-#define __XFS_SUPPORT_SEMA_H__
-
-#include <linux/time.h>
-#include <linux/wait.h>
-#include <asm/atomic.h>
-#include <asm/semaphore.h>
-
-/*
- * sema_t structure just maps to struct semaphore in Linux kernel.
- */
-
-typedef struct semaphore sema_t;
-
-#define init_sema(sp, val, c, d)       sema_init(sp, val)
-#define initsema(sp, val)              sema_init(sp, val)
-#define initnsema(sp, val, name)       sema_init(sp, val)
-#define psema(sp, b)                   down(sp)
-#define vsema(sp)                      up(sp)
-#define valusema(sp)                   (atomic_read(&(sp)->count))
-#define freesema(sema)
-
-/*
- * Map cpsema (try to get the sema) to down_trylock. We need to switch
- * the return values since cpsema returns 1 (acquired) 0 (failed) and
- * down_trylock returns the reverse 0 (acquired) 1 (failed).
- */
-
-#define cpsema(sp)                     (down_trylock(sp) ? 0 : 1)
-
-/*
- * Didn't do cvsema(sp). Not sure how to map this to up/down/...
- * It does a vsema if the values is < 0 other wise nothing.
- */
-
-#endif /* __XFS_SUPPORT_SEMA_H__ */
diff --git a/fs/xfs/support/spin.h b/fs/xfs/support/spin.h
deleted file mode 100644 (file)
index 80a3a6b..0000000
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like.  Any license provided herein, whether implied or
- * otherwise, applies only to this software file.  Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA  94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-#ifndef __XFS_SUPPORT_SPIN_H__
-#define __XFS_SUPPORT_SPIN_H__
-
-#include <linux/sched.h>       /* preempt needs this */
-#include <linux/spinlock.h>
-
-/*
- * Map lock_t from IRIX to Linux spinlocks.
- *
- * Note that linux turns on/off spinlocks depending on CONFIG_SMP.
- * We don't need to worry about SMP or not here.
- */
-
-#define SPLDECL(s)             unsigned long s
-
-typedef spinlock_t lock_t;
-
-#define spinlock_init(lock, name)      spin_lock_init(lock)
-#define        spinlock_destroy(lock)
-
-static inline unsigned long mutex_spinlock(lock_t *lock)
-{
-       spin_lock(lock);
-       return 0;
-}
-
-/*ARGSUSED*/
-static inline void mutex_spinunlock(lock_t *lock, unsigned long s)
-{
-       spin_unlock(lock);
-}
-
-static inline void nested_spinlock(lock_t *lock)
-{
-       spin_lock(lock);
-}
-
-static inline void nested_spinunlock(lock_t *lock)
-{
-       spin_unlock(lock);
-}
-
-#endif /* __XFS_SUPPORT_SPIN_H__ */
diff --git a/fs/xfs/support/sv.h b/fs/xfs/support/sv.h
deleted file mode 100644 (file)
index 821d316..0000000
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like.  Any license provided herein, whether implied or
- * otherwise, applies only to this software file.  Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA  94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-#ifndef __XFS_SUPPORT_SV_H__
-#define __XFS_SUPPORT_SV_H__
-
-#include <linux/wait.h>
-#include <linux/sched.h>
-#include <linux/spinlock.h>
-
-/*
- * Synchronisation variables.
- *
- * (Parameters "pri", "svf" and "rts" are not implemented)
- */
-
-typedef struct sv_s {
-       wait_queue_head_t waiters;
-} sv_t;
-
-#define SV_FIFO                0x0             /* sv_t is FIFO type */
-#define SV_LIFO                0x2             /* sv_t is LIFO type */
-#define SV_PRIO                0x4             /* sv_t is PRIO type */
-#define SV_KEYED       0x6             /* sv_t is KEYED type */
-#define SV_DEFAULT      SV_FIFO
-
-
-static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state,
-                            unsigned long timeout)
-{
-       DECLARE_WAITQUEUE(wait, current);
-
-       add_wait_queue_exclusive(&sv->waiters, &wait);
-       __set_current_state(state);
-       spin_unlock(lock);
-
-       schedule_timeout(timeout);
-
-       remove_wait_queue(&sv->waiters, &wait);
-}
-
-#define init_sv(sv,type,name,flag) \
-       init_waitqueue_head(&(sv)->waiters)
-#define sv_init(sv,flag,name) \
-       init_waitqueue_head(&(sv)->waiters)
-#define sv_destroy(sv) \
-       /*NOTHING*/
-#define sv_wait(sv, pri, lock, s) \
-       _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
-#define sv_wait_sig(sv, pri, lock, s)   \
-       _sv_wait(sv, lock, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
-#define sv_timedwait(sv, pri, lock, s, svf, ts, rts) \
-       _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, timespec_to_jiffies(ts))
-#define sv_timedwait_sig(sv, pri, lock, s, svf, ts, rts) \
-       _sv_wait(sv, lock, TASK_INTERRUPTIBLE, timespec_to_jiffies(ts))
-#define sv_signal(sv) \
-       wake_up(&(sv)->waiters)
-#define sv_broadcast(sv) \
-       wake_up_all(&(sv)->waiters)
-
-#endif /* __XFS_SUPPORT_SV_H__ */
diff --git a/fs/xfs/support/time.h b/fs/xfs/support/time.h
deleted file mode 100644 (file)
index 109b5c0..0000000
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like.  Any license provided herein, whether implied or
- * otherwise, applies only to this software file.  Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA  94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-#ifndef __XFS_SUPPORT_TIME_H__
-#define __XFS_SUPPORT_TIME_H__
-
-#include <linux/sched.h>
-#include <linux/time.h>
-
-typedef struct timespec timespec_t;
-
-static inline void delay(long ticks)
-{
-       current->state = TASK_UNINTERRUPTIBLE;
-       schedule_timeout(ticks);
-}
-
-static inline void nanotime(struct timespec *tvp)
-{
-       *tvp = CURRENT_TIME;
-}
-
-#endif /* __XFS_SUPPORT_TIME_H__ */
index a13aedbdddc9f7fc907a42c2458b39f006d0e2bd..7e44857648ab8beab8b9b49301e8d14b5d7e9429 100644 (file)
 #include <linux/types.h>
 #include <xfs_types.h>
 #include <xfs_arch.h>
-#include "time.h"
+#include <time.h>
+#include <kmem.h>
+#include <mutex.h>
 #include "uuid.h"
-#include "kmem.h"
 #include "debug.h"
-#include "mutex.h"
 
 static mutex_t uuid_monitor;
 static int     uuid_table_size;
index f9cfa5afa8cf0047c6ae0266c2b6a5feaf9902d8..d916806daf77308480f52aaff898a6721959783e 100644 (file)
 
 #include <xfs_arch.h>
 
-#include <support/kmem.h>
-#include <support/mrlock.h>
 #include <support/qsort.h>
-#include <support/spin.h>
-#include <support/sv.h>
 #include <support/ktrace.h>
-#include <support/mutex.h>
-#include <support/sema.h>
 #include <support/debug.h>
 #include <support/move.h>
 #include <support/uuid.h>
-#include <support/time.h>
 
 #include <linux/xfs_linux.h>
 
diff --git a/fs/xfs/xfs_behavior.c b/fs/xfs/xfs_behavior.c
new file mode 100644 (file)
index 0000000..16088e1
--- /dev/null
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.  Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ *
+ */
+#include "xfs.h"
+
+/*
+ * Source file used to associate/disassociate behaviors with virtualized
+ * objects.  See xfs_behavior.h for more information about behaviors, etc.
+ *
+ * The implementation is split between functions in this file and macros
+ * in xfs_behavior.h.
+ */
+
+/*
+ * Insert a new behavior descriptor into a behavior chain.
+ *
+ * The behavior chain is ordered based on the 'position' number which
+ * lives in the first field of the ops vector (higher numbers first).
+ *
+ * Attempts to insert duplicate ops result in an EINVAL return code.
+ * Otherwise, return 0 to indicate success.
+ */
+int
+bhv_insert(bhv_head_t *bhp, bhv_desc_t *bdp)
+{
+       bhv_desc_t      *curdesc, *prev;
+       int             position;
+
+       /*
+        * Validate the position value of the new behavior.
+        */
+       position = BHV_POSITION(bdp);
+       ASSERT(position >= BHV_POSITION_BASE && position <= BHV_POSITION_TOP);
+
+       /*
+        * Find location to insert behavior.  Check for duplicates.
+        */
+       prev = NULL;
+       for (curdesc = bhp->bh_first;
+            curdesc != NULL;
+            curdesc = curdesc->bd_next) {
+
+               /* Check for duplication. */
+               if (curdesc->bd_ops == bdp->bd_ops) {
+                       ASSERT(0);
+                       return EINVAL;
+               }
+
+               /* Find correct position */
+               if (position >= BHV_POSITION(curdesc)) {
+                       ASSERT(position != BHV_POSITION(curdesc));
+                       break;          /* found it */
+               }
+
+               prev = curdesc;
+       }
+
+       if (prev == NULL) {
+               /* insert at front of chain */
+               bdp->bd_next = bhp->bh_first;
+               bhp->bh_first = bdp;
+       } else {
+               /* insert after prev */
+               bdp->bd_next = prev->bd_next;
+               prev->bd_next = bdp;
+       }
+
+       return 0;
+}
+
+/*
+ * Remove a behavior descriptor from a position in a behavior chain;
+ * the position is guaranteed not to be the first position.
+ * Should only be called by the bhv_remove() macro.
+ */
+void
+bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp)
+{
+       bhv_desc_t      *curdesc, *prev;
+
+       ASSERT(bhp->bh_first != NULL);
+       ASSERT(bhp->bh_first->bd_next != NULL);
+
+       prev = bhp->bh_first;
+       for (curdesc = bhp->bh_first->bd_next;
+            curdesc != NULL;
+            curdesc = curdesc->bd_next) {
+
+               if (curdesc == bdp)
+                       break;          /* found it */
+               prev = curdesc;
+       }
+
+       ASSERT(curdesc == bdp);
+       prev->bd_next = bdp->bd_next;   /* remove from after prev */
+}
+
+/*
+ * Look for a specific ops vector on the specified behavior chain.
+ * Return the associated behavior descriptor.  Or NULL, if not found.
+ */
+bhv_desc_t *
+bhv_lookup(bhv_head_t *bhp, void *ops)
+{
+       bhv_desc_t      *curdesc;
+
+       for (curdesc = bhp->bh_first;
+            curdesc != NULL;
+            curdesc = curdesc->bd_next) {
+
+               if (curdesc->bd_ops == ops)
+                       return curdesc;
+       }
+
+       return NULL;
+}
+
+/*
+ * Looks for the first behavior within a specified range of positions.
+ * Return the associated behavior descriptor.  Or NULL, if none found.
+ */
+bhv_desc_t *
+bhv_lookup_range(bhv_head_t *bhp, int low, int high)
+{
+       bhv_desc_t      *curdesc;
+
+       for (curdesc = bhp->bh_first;
+            curdesc != NULL;
+            curdesc = curdesc->bd_next) {
+
+               int     position = BHV_POSITION(curdesc);
+
+               if (position <= high) {
+                       if (position >= low)
+                               return curdesc;
+                       return NULL;
+               }
+       }
+
+       return NULL;
+}
+
+/*
+ * Return the base behavior in the chain, or NULL if the chain
+ * is empty.
+ *
+ * The caller has not read locked the behavior chain, so acquire the
+ * lock before traversing the chain.
+ */
+bhv_desc_t *
+bhv_base(bhv_head_t *bhp)
+{
+       bhv_desc_t      *curdesc;
+
+       for (curdesc = bhp->bh_first;
+            curdesc != NULL;
+            curdesc = curdesc->bd_next) {
+
+               if (curdesc->bd_next == NULL) {
+                       return curdesc;
+               }
+       }
+
+       return NULL;
+}
+
+void
+bhv_head_init(
+       bhv_head_t *bhp,
+       char *name)
+{
+       bhp->bh_first = NULL;
+}
+
+void
+bhv_insert_initial(
+       bhv_head_t *bhp,
+       bhv_desc_t *bdp)
+{
+       ASSERT(bhp->bh_first == NULL);
+       (bhp)->bh_first = bdp;
+}
+
+void
+bhv_head_destroy(
+       bhv_head_t *bhp)
+{
+       ASSERT(bhp->bh_first == NULL);
+}
diff --git a/fs/xfs/xfs_behavior.h b/fs/xfs/xfs_behavior.h
new file mode 100644 (file)
index 0000000..d5ed5a8
--- /dev/null
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.  Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_BEHAVIOR_H__
+#define __XFS_BEHAVIOR_H__
+
+/*
+ * Header file used to associate behaviors with virtualized objects.
+ *
+ * A virtualized object is an internal, virtualized representation of
+ * OS entities such as persistent files, processes, or sockets.  Examples
+ * of virtualized objects include vnodes, vprocs, and vsockets.  Often
+ * a virtualized object is referred to simply as an "object."
+ *
+ * A behavior is essentially an implementation layer associated with
+ * an object.  Multiple behaviors for an object are chained together,
+ * the order of chaining determining the order of invocation.  Each
+ * behavior of a given object implements the same set of interfaces
+ * (e.g., the VOP interfaces).
+ *
+ * Behaviors may be dynamically inserted into an object's behavior chain,
+ * such that the addition is transparent to consumers that already have
+ * references to the object.  Typically, a given behavior will be inserted
+ * at a particular location in the behavior chain.  Insertion of new
+ * behaviors is synchronized with operations-in-progress (oip's) so that
+ * the oip's always see a consistent view of the chain.
+ *
+ * The term "interposition" is used to refer to the act of inserting
+ * a behavior such that it interposes on (i.e., is inserted in front
+ * of) a particular other behavior.  A key example of this is when a
+ * system implementing distributed single system image wishes to
+ * interpose a distribution layer (providing distributed coherency)
+ * in front of an object that is otherwise only accessed locally.
+ *
+ * Note that the traditional vnode/inode combination is simply a virtualized
+ * object that has exactly one associated behavior.
+ *
+ * Behavior synchronization is logic which is necessary, under certain
+ * circumstances, to ensure that there is no conflict between ongoing
+ * operations traversing the behavior chain and those dynamically modifying the
+ * behavior chain.  Because behavior synchronization adds extra overhead
+ * to virtual operation invocation, we want to restrict, as much as
+ * we can, the requirement for this extra code, to those situations
+ * in which it is truly necessary.
+ *
+ * Behavior synchronization is needed whenever there's at least one class
+ * of object in the system for which:
+ * 1) multiple behaviors for a given object are supported,
+ * -- AND --
+ * 2a) insertion of a new behavior can happen dynamically at any time during
+ *     the life of an active object,
+ *     -- AND --
+ *     3a) insertion of a new behavior needs to synchronize with existing
+ *         ops-in-progress.
+ *     -- OR --
+ *     3b) multiple different behaviors can be dynamically inserted at
+ *         any time during the life of an active object
+ *     -- OR --
+ *     3c) removal of a behavior can occur at any time during the life of
+ *         an active object.
+ * -- OR --
+ * 2b) removal of a behavior can occur at any time during the life of an
+ *     active object
+ *
+ */
+
+struct bhv_head_lock;
+
+/*
+ * Behavior head.  Head of the chain of behaviors.
+ * Contained within each virtualized object data structure.
+ */
+typedef struct bhv_head {
+       struct bhv_desc *bh_first;      /* first behavior in chain */
+       struct bhv_head_lock *bh_lockp; /* pointer to lock info struct */
+} bhv_head_t;
+
+/*
+ * Behavior descriptor.         Descriptor associated with each behavior.
+ * Contained within the behavior's private data structure.
+ */
+typedef struct bhv_desc {
+       void            *bd_pdata;      /* private data for this behavior */
+       void            *bd_vobj;       /* virtual object associated with */
+       void            *bd_ops;        /* ops for this behavior */
+       struct bhv_desc *bd_next;       /* next behavior in chain */
+} bhv_desc_t;
+
+/*
+ * Behavior identity field.  A behavior's identity determines the position
+ * where it lives within a behavior chain, and it's always the first field
+ * of the behavior's ops vector. The optional id field further identifies the
+ * subsystem responsible for the behavior.
+ */
+typedef struct bhv_identity {
+       __u16   bi_id;          /* owning subsystem id */
+       __u16   bi_position;    /* position in chain */
+} bhv_identity_t;
+
+typedef bhv_identity_t bhv_position_t;
+
+#define BHV_IDENTITY_INIT(id,pos)      {id, pos}
+#define BHV_IDENTITY_INIT_POSITION(pos) BHV_IDENTITY_INIT(0, pos)
+
+/*
+ * Define boundaries of position values.
+ */
+#define BHV_POSITION_INVALID   0       /* invalid position number */
+#define BHV_POSITION_BASE      1       /* base (last) implementation layer */
+#define BHV_POSITION_TOP       63      /* top (first) implementation layer */
+
+/*
+ * Plumbing macros.
+ */
+#define BHV_HEAD_FIRST(bhp)    (ASSERT((bhp)->bh_first), (bhp)->bh_first)
+#define BHV_NEXT(bdp)          (ASSERT((bdp)->bd_next), (bdp)->bd_next)
+#define BHV_NEXTNULL(bdp)      ((bdp)->bd_next)
+#define BHV_VOBJ(bdp)          (ASSERT((bdp)->bd_vobj), (bdp)->bd_vobj)
+#define BHV_VOBJNULL(bdp)      ((bdp)->bd_vobj)
+#define BHV_PDATA(bdp)         (bdp)->bd_pdata
+#define BHV_OPS(bdp)           (bdp)->bd_ops
+#define BHV_IDENTITY(bdp)      ((bhv_identity_t *)(bdp)->bd_ops)
+#define BHV_POSITION(bdp)      (BHV_IDENTITY(bdp)->bi_position)
+
+extern void bhv_head_init(bhv_head_t *, char *);
+extern void bhv_head_destroy(bhv_head_t *);
+extern int  bhv_insert(bhv_head_t *, bhv_desc_t *);
+extern void bhv_insert_initial(bhv_head_t *, bhv_desc_t *);
+
+/*
+ * Initialize a new behavior descriptor.
+ * Arguments:
+ *   bdp - pointer to behavior descriptor
+ *   pdata - pointer to behavior's private data
+ *   vobj - pointer to associated virtual object
+ *   ops - pointer to ops for this behavior
+ */
+#define bhv_desc_init(bdp, pdata, vobj, ops)           \
+ {                                                     \
+       (bdp)->bd_pdata = pdata;                        \
+       (bdp)->bd_vobj = vobj;                          \
+       (bdp)->bd_ops = ops;                            \
+       (bdp)->bd_next = NULL;                          \
+ }
+
+/*
+ * Remove a behavior descriptor from a behavior chain.
+ */
+#define bhv_remove(bhp, bdp)                           \
+ {                                                     \
+       if ((bhp)->bh_first == (bdp)) {                 \
+               /*                                      \
+               * Remove from front of chain.           \
+               * Atomic wrt oip's.                     \
+               */                                      \
+              (bhp)->bh_first = (bdp)->bd_next;        \
+       } else {                                        \
+              /* remove from non-front of chain */     \
+              bhv_remove_not_first(bhp, bdp);          \
+       }                                               \
+       (bdp)->bd_vobj = NULL;                          \
+ }
+
+/*
+ * Behavior module prototypes.
+ */
+extern void            bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp);
+extern bhv_desc_t *    bhv_lookup(bhv_head_t *bhp, void *ops);
+extern bhv_desc_t *    bhv_lookup_range(bhv_head_t *bhp, int low, int high);
+extern bhv_desc_t *    bhv_base(bhv_head_t *bhp);
+
+/* No bhv locking on Linux */
+#define bhv_lookup_unlocked    bhv_lookup
+#define bhv_base_unlocked      bhv_base
+
+#endif /* __XFS_BEHAVIOR_H__ */
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
deleted file mode 100644 (file)
index 4254459..0000000
+++ /dev/null
@@ -1,308 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like.         Any license provided herein, whether implied or
- * otherwise, applies only to this software file.  Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA  94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-#ifndef __XFS_BUF_H__
-#define __XFS_BUF_H__
-
-/* These are just for xfs_syncsub... it sets an internal variable
- * then passes it to VOP_FLUSH_PAGES or adds the flags to a newly gotten buf_t
- */
-#define XFS_B_ASYNC            PBF_ASYNC
-#define XFS_B_DELWRI           PBF_DELWRI
-#define XFS_B_READ             PBF_READ
-#define XFS_B_WRITE            PBF_WRITE
-#define XFS_B_STALE            PBF_STALE
-
-#define XFS_BUF_TRYLOCK                PBF_TRYLOCK
-#define XFS_INCORE_TRYLOCK     PBF_TRYLOCK
-#define XFS_BUF_LOCK           PBF_LOCK
-#define XFS_BUF_MAPPED         PBF_MAPPED
-
-#define BUF_BUSY               PBF_DONT_BLOCK
-
-#define XFS_BUF_BFLAGS(x)      ((x)->pb_flags)
-#define XFS_BUF_ZEROFLAGS(x)   \
-       ((x)->pb_flags &= ~(PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_SYNC|PBF_DELWRI))
-
-#define XFS_BUF_STALE(x)       ((x)->pb_flags |= XFS_B_STALE)
-#define XFS_BUF_UNSTALE(x)     ((x)->pb_flags &= ~XFS_B_STALE)
-#define XFS_BUF_ISSTALE(x)     ((x)->pb_flags & XFS_B_STALE)
-#define XFS_BUF_SUPER_STALE(x) do {                            \
-                                       XFS_BUF_STALE(x);       \
-                                       xfs_buf_undelay(x);     \
-                                       XFS_BUF_DONE(x);        \
-                               } while (0)
-
-#define XFS_BUF_MANAGE         PBF_FS_MANAGED
-#define XFS_BUF_UNMANAGE(x)    ((x)->pb_flags &= ~PBF_FS_MANAGED)
-
-static inline void xfs_buf_undelay(page_buf_t *pb)
-{
-       if (pb->pb_flags & PBF_DELWRI) {
-               if (pb->pb_list.next != &pb->pb_list) {
-                       pagebuf_delwri_dequeue(pb);
-                       pagebuf_rele(pb);
-               } else {
-                       pb->pb_flags &= ~PBF_DELWRI;
-               }
-       }
-}
-
-#define XFS_BUF_DELAYWRITE(x)   ((x)->pb_flags |= PBF_DELWRI)
-#define XFS_BUF_UNDELAYWRITE(x)         xfs_buf_undelay(x)
-#define XFS_BUF_ISDELAYWRITE(x)         ((x)->pb_flags & PBF_DELWRI)
-
-#define XFS_BUF_ERROR(x,no)     pagebuf_ioerror(x,no)
-#define XFS_BUF_GETERROR(x)     pagebuf_geterror(x)
-#define XFS_BUF_ISERROR(x)      (pagebuf_geterror(x)?1:0)
-
-#define XFS_BUF_DONE(x)                 ((x)->pb_flags &= ~(PBF_PARTIAL|PBF_NONE))
-#define XFS_BUF_UNDONE(x)       ((x)->pb_flags |= PBF_PARTIAL|PBF_NONE)
-#define XFS_BUF_ISDONE(x)       (!(PBF_NOT_DONE(x)))
-
-#define XFS_BUF_BUSY(x)                 ((x)->pb_flags |= PBF_FORCEIO)
-#define XFS_BUF_UNBUSY(x)       ((x)->pb_flags &= ~PBF_FORCEIO)
-#define XFS_BUF_ISBUSY(x)       (1)
-
-#define XFS_BUF_ASYNC(x)        ((x)->pb_flags |= PBF_ASYNC)
-#define XFS_BUF_UNASYNC(x)      ((x)->pb_flags &= ~PBF_ASYNC)
-#define XFS_BUF_ISASYNC(x)      ((x)->pb_flags & PBF_ASYNC)
-
-#define XFS_BUF_FLUSH(x)        ((x)->pb_flags |= PBF_FLUSH)
-#define XFS_BUF_UNFLUSH(x)      ((x)->pb_flags &= ~PBF_FLUSH)
-#define XFS_BUF_ISFLUSH(x)      ((x)->pb_flags & PBF_FLUSH)
-
-#define XFS_BUF_SHUT(x)                 printk("XFS_BUF_SHUT not implemented yet\n")
-#define XFS_BUF_UNSHUT(x)       printk("XFS_BUF_UNSHUT not implemented yet\n")
-#define XFS_BUF_ISSHUT(x)       (0)
-
-#define XFS_BUF_HOLD(x)                pagebuf_hold(x)
-#define XFS_BUF_READ(x)                ((x)->pb_flags |= PBF_READ)
-#define XFS_BUF_UNREAD(x)      ((x)->pb_flags &= ~PBF_READ)
-#define XFS_BUF_ISREAD(x)      ((x)->pb_flags & PBF_READ)
-
-#define XFS_BUF_WRITE(x)       ((x)->pb_flags |= PBF_WRITE)
-#define XFS_BUF_UNWRITE(x)     ((x)->pb_flags &= ~PBF_WRITE)
-#define XFS_BUF_ISWRITE(x)     ((x)->pb_flags & PBF_WRITE)
-
-#define XFS_BUF_ISUNINITIAL(x)  (0)
-#define XFS_BUF_UNUNINITIAL(x)  (0)
-
-#define XFS_BUF_BP_ISMAPPED(bp)         1
-
-typedef struct page_buf_s xfs_buf_t;
-#define xfs_buf page_buf_s
-
-typedef struct pb_target xfs_buftarg_t;
-#define xfs_buftarg pb_target
-
-#define XFS_BUF_DATAIO(x)      ((x)->pb_flags |= PBF_FS_DATAIOD)
-#define XFS_BUF_UNDATAIO(x)    ((x)->pb_flags &= ~PBF_FS_DATAIOD)
-
-#define XFS_BUF_IODONE_FUNC(buf)       (buf)->pb_iodone
-#define XFS_BUF_SET_IODONE_FUNC(buf, func)     \
-                       (buf)->pb_iodone = (func)
-#define XFS_BUF_CLR_IODONE_FUNC(buf)           \
-                       (buf)->pb_iodone = NULL
-#define XFS_BUF_SET_BDSTRAT_FUNC(buf, func)    \
-                       (buf)->pb_strat = (func)
-#define XFS_BUF_CLR_BDSTRAT_FUNC(buf)          \
-                       (buf)->pb_strat = NULL
-
-#define XFS_BUF_FSPRIVATE(buf, type)           \
-                       ((type)(buf)->pb_fspriv)
-#define XFS_BUF_SET_FSPRIVATE(buf, value)      \
-                       (buf)->pb_fspriv = (void *)(value)
-#define XFS_BUF_FSPRIVATE2(buf, type)          \
-                       ((type)(buf)->pb_fspriv2)
-#define XFS_BUF_SET_FSPRIVATE2(buf, value)     \
-                       (buf)->pb_fspriv2 = (void *)(value)
-#define XFS_BUF_FSPRIVATE3(buf, type)          \
-                       ((type)(buf)->pb_fspriv3)
-#define XFS_BUF_SET_FSPRIVATE3(buf, value)     \
-                       (buf)->pb_fspriv3  = (void *)(value)
-#define XFS_BUF_SET_START(buf)
-
-#define XFS_BUF_SET_BRELSE_FUNC(buf, value) \
-                       (buf)->pb_relse = (value)
-
-#define XFS_BUF_PTR(bp)                (xfs_caddr_t)((bp)->pb_addr)
-
-extern inline xfs_caddr_t xfs_buf_offset(page_buf_t *bp, size_t offset)
-{
-       if (bp->pb_flags & PBF_MAPPED)
-               return XFS_BUF_PTR(bp) + offset;
-       return (xfs_caddr_t) pagebuf_offset(bp, offset);
-}
-
-#define XFS_BUF_SET_PTR(bp, val, count)                \
-                               pagebuf_associate_memory(bp, val, count)
-#define XFS_BUF_ADDR(bp)       ((bp)->pb_bn)
-#define XFS_BUF_SET_ADDR(bp, blk)              \
-                       ((bp)->pb_bn = (page_buf_daddr_t)(blk))
-#define XFS_BUF_OFFSET(bp)     ((bp)->pb_file_offset)
-#define XFS_BUF_SET_OFFSET(bp, off)            \
-                       ((bp)->pb_file_offset = (off))
-#define XFS_BUF_COUNT(bp)      ((bp)->pb_count_desired)
-#define XFS_BUF_SET_COUNT(bp, cnt)             \
-                       ((bp)->pb_count_desired = (cnt))
-#define XFS_BUF_SIZE(bp)       ((bp)->pb_buffer_length)
-#define XFS_BUF_SET_SIZE(bp, cnt)              \
-                       ((bp)->pb_buffer_length = (cnt))
-#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)
-#define XFS_BUF_SET_VTYPE(bp, type)
-#define XFS_BUF_SET_REF(bp, ref)
-
-#define XFS_BUF_ISPINNED(bp)   pagebuf_ispin(bp)
-
-#define XFS_BUF_VALUSEMA(bp)   pagebuf_lock_value(bp)
-#define XFS_BUF_CPSEMA(bp)     (pagebuf_cond_lock(bp) == 0)
-#define XFS_BUF_VSEMA(bp)      pagebuf_unlock(bp)
-#define XFS_BUF_PSEMA(bp,x)    pagebuf_lock(bp)
-#define XFS_BUF_V_IODONESEMA(bp) up(&bp->pb_iodonesema);
-
-/* setup the buffer target from a buftarg structure */
-#define XFS_BUF_SET_TARGET(bp, target) \
-               (bp)->pb_target = (target)
-#define XFS_BUF_TARGET(bp)     ((bp)->pb_target)
-#define XFS_BUFTARG_NAME(target)       \
-               pagebuf_target_name(target)
-
-#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)
-#define XFS_BUF_SET_VTYPE(bp, type)
-#define XFS_BUF_SET_REF(bp, ref)
-
-#define xfs_buf_read(target, blkno, len, flags) \
-               pagebuf_get((target), (blkno), (len), \
-                       PBF_LOCK | PBF_READ | PBF_MAPPED | PBF_MAPPABLE)
-#define xfs_buf_get(target, blkno, len, flags) \
-               pagebuf_get((target), (blkno), (len), \
-                       PBF_LOCK | PBF_MAPPED | PBF_MAPPABLE)
-
-#define xfs_buf_read_flags(target, blkno, len, flags) \
-               pagebuf_get((target), (blkno), (len), \
-                       PBF_READ | PBF_MAPPABLE | flags)
-#define xfs_buf_get_flags(target, blkno, len, flags) \
-               pagebuf_get((target), (blkno), (len), \
-                       PBF_MAPPABLE | flags)
-
-static inline int      xfs_bawrite(void *mp, page_buf_t *bp)
-{
-       bp->pb_fspriv3 = mp;
-       bp->pb_strat = xfs_bdstrat_cb;
-       xfs_buf_undelay(bp);
-       return pagebuf_iostart(bp, PBF_WRITE | PBF_ASYNC | PBF_RUN_QUEUES);
-}
-
-static inline void     xfs_buf_relse(page_buf_t *bp)
-{
-       if ((bp->pb_flags & _PBF_LOCKABLE) && !bp->pb_relse)
-               pagebuf_unlock(bp);
-       pagebuf_rele(bp);
-}
-
-#define xfs_bpin(bp)           pagebuf_pin(bp)
-#define xfs_bunpin(bp)         pagebuf_unpin(bp)
-
-#define xfs_buftrace(id, bp)   \
-           pagebuf_trace(bp, id, NULL, (void *)__builtin_return_address(0))
-
-#define xfs_biodone(pb)                    \
-           pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), 0)
-
-#define xfs_incore(buftarg,blkno,len,lockit) \
-           pagebuf_find(buftarg, blkno ,len, lockit)
-
-
-#define xfs_biomove(pb, off, len, data, rw) \
-           pagebuf_iomove((pb), (off), (len), (data), \
-               ((rw) == XFS_B_WRITE) ? PBRW_WRITE : PBRW_READ)
-
-#define xfs_biozero(pb, off, len) \
-           pagebuf_iomove((pb), (off), (len), NULL, PBRW_ZERO)
-
-
-static inline int      XFS_bwrite(page_buf_t *pb)
-{
-       int     iowait = (pb->pb_flags & PBF_ASYNC) == 0;
-       int     error = 0;
-
-       pb->pb_flags |= PBF_SYNC;
-       if (!iowait)
-               pb->pb_flags |= PBF_RUN_QUEUES;
-
-       xfs_buf_undelay(pb);
-       pagebuf_iostrategy(pb);
-       if (iowait) {
-               error = pagebuf_iowait(pb);
-               xfs_buf_relse(pb);
-       }
-       return error;
-}
-
-#define XFS_bdwrite(pb)                     \
-           pagebuf_iostart(pb, PBF_DELWRI | PBF_ASYNC)
-
-static inline int xfs_bdwrite(void *mp, page_buf_t *bp)
-{
-       bp->pb_strat = xfs_bdstrat_cb;
-       bp->pb_fspriv3 = mp;
-
-       return pagebuf_iostart(bp, PBF_DELWRI | PBF_ASYNC);
-}
-
-#define XFS_bdstrat(bp) pagebuf_iorequest(bp)
-
-#define xfs_iowait(pb) pagebuf_iowait(pb)
-
-
-/*
- * Go through all incore buffers, and release buffers
- * if they belong to the given device. This is used in
- * filesystem error handling to preserve the consistency
- * of its metadata.
- */
-
-#define xfs_binval(buftarg)    xfs_flush_buftarg(buftarg)
-
-#define XFS_bflush(buftarg)    xfs_flush_buftarg(buftarg)
-
-#define xfs_incore_relse(buftarg,delwri_only,wait)     \
-       xfs_relse_buftarg(buftarg)
-
-#define xfs_baread(target, rablkno, ralen)  \
-       pagebuf_readahead((target), (rablkno), (ralen), PBF_DONT_BLOCK)
-
-#define xfs_buf_get_empty(len, target) pagebuf_get_empty((len), (target))
-#define xfs_buf_get_noaddr(len, target)        pagebuf_get_no_daddr((len), (target))
-#define xfs_buf_free(bp)               pagebuf_free(bp)
-
-#endif /* __XFS_BUF_H__ */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
new file mode 100644 (file)
index 0000000..ae4818a
--- /dev/null
@@ -0,0 +1,795 @@
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.         Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include "xfs.h"
+
+#include "xfs_fs.h"
+#include "xfs_inum.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir.h"
+#include "xfs_dir2.h"
+#include "xfs_alloc.h"
+#include "xfs_dmapi.h"
+#include "xfs_quota.h"
+#include "xfs_mount.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_attr_sf.h"
+#include "xfs_dir_sf.h"
+#include "xfs_dir2_sf.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_bit.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_itable.h"
+#include "xfs_rw.h"
+#include "xfs_acl.h"
+#include "xfs_cap.h"
+#include "xfs_mac.h"
+#include "xfs_attr.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_space.h"
+#include "xfs_utils.h"
+#include "xfs_iomap.h"
+
+#define XFS_WRITEIO_ALIGN(mp,off)      (((off) >> mp->m_writeio_log) \
+                                               << mp->m_writeio_log)
+#define XFS_STRAT_WRITE_IMAPS  2
+#define XFS_WRITE_IMAPS                XFS_BMAP_MAX_NMAP
+
+/*
+ * Convert up to min(imaps, iomaps) bmap extent records in @imap into the
+ * caller-supplied xfs_iomap_t array @iomapp, starting at file byte offset
+ * @offset.  @flags is OR-ed into each entry's iomap_flags before the
+ * hole/delalloc/unwritten state is applied.  Returns the number of
+ * iomap entries actually filled in.
+ */
+STATIC int
+xfs_imap_to_bmap(
+       xfs_iocore_t    *io,
+       xfs_off_t       offset,
+       xfs_bmbt_irec_t *imap,
+       xfs_iomap_t     *iomapp,
+       int             imaps,                  /* Number of imap entries */
+       int             iomaps,                 /* Number of iomap entries */
+       int             flags)
+{
+       xfs_mount_t     *mp;
+       xfs_fsize_t     nisize;
+       int             pbm;
+       xfs_fsblock_t   start_block;
+
+       mp = io->io_mount;
+       /* use the larger of the on-disk size and any in-flight new size
+        * so IOMAP_EOF is set against the size the file is growing to */
+       nisize = XFS_SIZE(mp, io);
+       if (io->io_new_size > nisize)
+               nisize = io->io_new_size;
+
+       for (pbm = 0; imaps && pbm < iomaps; imaps--, iomapp++, imap++, pbm++) {
+               /* realtime inodes map to the rt device, all others to data */
+               iomapp->iomap_target = io->io_flags & XFS_IOCORE_RT ?
+                       mp->m_rtdev_targp : mp->m_ddev_targp;
+               iomapp->iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff);
+               /* delta = how far @offset sits inside this extent */
+               iomapp->iomap_delta = offset - iomapp->iomap_offset;
+               iomapp->iomap_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount);
+               iomapp->iomap_flags = flags;
+
+               start_block = imap->br_startblock;
+               if (start_block == HOLESTARTBLOCK) {
+                       /* hole: no disk address */
+                       iomapp->iomap_bn = IOMAP_DADDR_NULL;
+                       iomapp->iomap_flags = IOMAP_HOLE;
+               } else if (start_block == DELAYSTARTBLOCK) {
+                       /* delayed allocation: blocks reserved, not placed */
+                       iomapp->iomap_bn = IOMAP_DADDR_NULL;
+                       iomapp->iomap_flags = IOMAP_DELAY;
+               } else {
+                       iomapp->iomap_bn = XFS_FSB_TO_DB_IO(io, start_block);
+                       if (ISUNWRITTEN(imap))
+                               iomapp->iomap_flags |= IOMAP_UNWRITTEN;
+               }
+
+               if ((iomapp->iomap_offset + iomapp->iomap_bsize) >= nisize) {
+                       iomapp->iomap_flags |= IOMAP_EOF;
+               }
+
+               /* advance to the start of the next extent */
+               offset += iomapp->iomap_bsize - iomapp->iomap_delta;
+       }
+       return pbm;     /* Return the number filled */
+}
+
+/*
+ * Central block-mapping entry point for the linux-side I/O paths.
+ *
+ * Dispatches on the BMAPI_* operation in @flags:
+ *   BMAPI_READ      - look up existing extents (shared map lock)
+ *   BMAPI_WRITE     - look up, allocating (direct or delalloc) on a hole
+ *   BMAPI_ALLOCATE  - convert an existing delalloc extent to real blocks
+ *   BMAPI_UNWRITTEN - convert unwritten extents after a write
+ *   BMAPI_DEVICE    - just report the target device, no mapping
+ *
+ * On success the mappings are converted into @iomapp / *@niomaps via
+ * xfs_imap_to_bmap().  Returns 0 or a positive XFS error code.  Any
+ * inode lock taken here is dropped again before returning.
+ */
+int
+xfs_iomap(
+       xfs_iocore_t    *io,
+       xfs_off_t       offset,
+       ssize_t         count,
+       int             flags,
+       xfs_iomap_t     *iomapp,
+       int             *niomaps)
+{
+       xfs_mount_t     *mp = io->io_mount;
+       xfs_fileoff_t   offset_fsb, end_fsb;
+       int             error = 0;
+       int             lockmode = 0;
+       xfs_bmbt_irec_t imap;
+       int             nimaps = 1;
+       int             bmapi_flags = 0;
+       int             iomap_flags = 0;
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return XFS_ERROR(EIO);
+
+       /* phase 1: take the right lock and set up the bmapi lookup */
+       switch (flags &
+               (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE |
+                BMAPI_UNWRITTEN | BMAPI_DEVICE)) {
+       case BMAPI_READ:
+               lockmode = XFS_LCK_MAP_SHARED(mp, io);
+               bmapi_flags = XFS_BMAPI_ENTIRE;
+               if (flags & BMAPI_IGNSTATE)
+                       bmapi_flags |= XFS_BMAPI_IGSTATE;
+               break;
+       case BMAPI_WRITE:
+               lockmode = XFS_ILOCK_EXCL|XFS_EXTSIZE_WR;
+               bmapi_flags = 0;
+               XFS_ILOCK(mp, io, lockmode);
+               break;
+       case BMAPI_ALLOCATE:
+               lockmode = XFS_ILOCK_SHARED|XFS_EXTSIZE_RD;
+               bmapi_flags = XFS_BMAPI_ENTIRE;
+               /* Attempt non-blocking lock */
+               if (flags & BMAPI_TRYLOCK) {
+                       if (!XFS_ILOCK_NOWAIT(mp, io, lockmode))
+                               return XFS_ERROR(EAGAIN);
+               } else {
+                       XFS_ILOCK(mp, io, lockmode);
+               }
+               break;
+       case BMAPI_UNWRITTEN:
+               /* no lookup needed; conversion takes its own locks */
+               goto phase2;
+       case BMAPI_DEVICE:
+               lockmode = XFS_LCK_MAP_SHARED(mp, io);
+               iomapp->iomap_target = io->io_flags & XFS_IOCORE_RT ?
+                       mp->m_rtdev_targp : mp->m_ddev_targp;
+               error = 0;
+               *niomaps = 1;
+               goto out;
+       default:
+               BUG();
+       }
+
+       /* clamp the request to the maximum supported file offset */
+       ASSERT(offset <= mp->m_maxioffset);
+       if ((xfs_fsize_t)offset + count > mp->m_maxioffset)
+               count = mp->m_maxioffset - offset;
+       end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
+       offset_fsb = XFS_B_TO_FSBT(mp, offset);
+
+       error = XFS_BMAPI(mp, NULL, io, offset_fsb,
+                       (xfs_filblks_t)(end_fsb - offset_fsb),
+                       bmapi_flags,  NULL, 0, &imap,
+                       &nimaps, NULL);
+
+       if (error)
+               goto out;
+
+phase2:
+       /* phase 2: allocate/convert if the lookup didn't satisfy us */
+       switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE|BMAPI_UNWRITTEN)) {
+       case BMAPI_WRITE:
+               /* If we found an extent, return it */
+               if (nimaps && (imap.br_startblock != HOLESTARTBLOCK))
+                       break;
+
+               /* hole: direct/mmap I/O allocates real blocks now,
+                * buffered I/O reserves delayed-allocation blocks */
+               if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) {
+                       error = XFS_IOMAP_WRITE_DIRECT(mp, io, offset,
+                                       count, flags, &imap, &nimaps, nimaps);
+               } else {
+                       error = XFS_IOMAP_WRITE_DELAY(mp, io, offset, count,
+                                       flags, &imap, &nimaps);
+               }
+               iomap_flags = IOMAP_NEW;
+               break;
+       case BMAPI_ALLOCATE:
+               /* If we found an extent, return it */
+               XFS_IUNLOCK(mp, io, lockmode);
+               lockmode = 0;
+
+               if (nimaps && !ISNULLSTARTBLOCK(imap.br_startblock))
+                       break;
+
+               error = XFS_IOMAP_WRITE_ALLOCATE(mp, io, &imap, &nimaps);
+               break;
+       case BMAPI_UNWRITTEN:
+               lockmode = 0;
+               error = XFS_IOMAP_WRITE_UNWRITTEN(mp, io, offset, count);
+               nimaps = 0;
+               break;
+       }
+
+       if (nimaps) {
+               *niomaps = xfs_imap_to_bmap(io, offset, &imap,
+                                       iomapp, nimaps, *niomaps, iomap_flags);
+       } else if (niomaps) {
+               *niomaps = 0;
+       }
+
+out:
+       if (lockmode)
+               XFS_IUNLOCK(mp, io, lockmode);
+       return XFS_ERROR(error);
+}
+
+/*
+ * Escalating attempt to free up reserved space when a delayed-allocation
+ * reservation fails with ENOSPC.  Driven by the caller's retry counter
+ * *@fsynced:
+ *   0 -> flush this inode's delalloc data (or fall through to sync mode)
+ *   1 -> switch the caller to BMAPI_SYNC writes
+ *   2 -> flush the whole device
+ * Returns 0 if the caller should retry the allocation, 1 when all
+ * strategies are exhausted.  Temporarily drops/retakes the ilock around
+ * the flushes, so callers must revalidate state afterwards.
+ */
+STATIC int
+xfs_flush_space(
+       xfs_inode_t     *ip,
+       int             *fsynced,
+       int             *ioflags)
+{
+       switch (*fsynced) {
+       case 0:
+               if (ip->i_delayed_blks) {
+                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+                       xfs_flush_inode(ip);
+                       xfs_ilock(ip, XFS_ILOCK_EXCL);
+                       *fsynced = 1;
+               } else {
+                       /* nothing of ours to flush; try sync writes */
+                       *ioflags |= BMAPI_SYNC;
+                       *fsynced = 2;
+               }
+               return 0;
+       case 1:
+               *fsynced = 2;
+               *ioflags |= BMAPI_SYNC;
+               return 0;
+       case 2:
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               xfs_flush_device(ip);
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               *fsynced = 3;
+               return 0;
+       }
+       /* fsynced >= 3: out of options, give up */
+       return 1;
+}
+
+/*
+ * Allocate real (non-delalloc) blocks for a direct or mmap write at
+ * @offset/@count.  @found/@ret_imap may describe a hole returned by a
+ * prior lookup, in which case the allocation is trimmed to that hole.
+ * On success *ret_imap/*nmaps describe the first allocated extent.
+ *
+ * Called with the ilock held exclusively; the lock is dropped around
+ * the transaction reservation and retaken, and is still held on return.
+ * Returns 0 or a positive XFS error code.
+ */
+int
+xfs_iomap_write_direct(
+       xfs_inode_t     *ip,
+       loff_t          offset,
+       size_t          count,
+       int             flags,
+       xfs_bmbt_irec_t *ret_imap,
+       int             *nmaps,
+       int             found)
+{
+       xfs_mount_t     *mp = ip->i_mount;
+       xfs_iocore_t    *io = &ip->i_iocore;
+       xfs_fileoff_t   offset_fsb;
+       xfs_fileoff_t   last_fsb;
+       xfs_filblks_t   count_fsb;
+       xfs_fsize_t     isize;
+       xfs_fsblock_t   firstfsb;
+       int             nimaps, maps;
+       int             error;
+       int             bmapi_flag;
+       int             rt;
+       xfs_trans_t     *tp;
+       xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS], *imapp;
+       xfs_bmap_free_t free_list;
+       int             aeof;
+       xfs_filblks_t   datablocks;
+       int             committed;
+       int             numrtextents;
+       uint            resblks;
+
+       /*
+        * Make sure that the dquots are there. This doesn't hold
+        * the ilock across a disk read.
+        */
+
+       error = XFS_QM_DQATTACH(ip->i_mount, ip, XFS_QMOPT_ILOCKED);
+       if (error)
+               return XFS_ERROR(error);
+
+       maps = min(XFS_WRITE_IMAPS, *nmaps);
+       nimaps = maps;
+
+       isize = ip->i_d.di_size;
+       aeof = (offset + count) > isize;
+
+       if (io->io_new_size > isize)
+               isize = io->io_new_size;
+
+       offset_fsb = XFS_B_TO_FSBT(mp, offset);
+       last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
+       count_fsb = last_fsb - offset_fsb;
+       if (found && (ret_imap->br_startblock == HOLESTARTBLOCK)) {
+               xfs_fileoff_t   map_last_fsb;
+
+               /* don't allocate past the end of the hole we were given */
+               map_last_fsb = ret_imap->br_blockcount + ret_imap->br_startoff;
+
+               if (map_last_fsb < last_fsb) {
+                       last_fsb = map_last_fsb;
+                       count_fsb = last_fsb - offset_fsb;
+               }
+               ASSERT(count_fsb > 0);
+       }
+
+       /*
+        * determine if reserving space on
+        * the data or realtime partition.
+        */
+       if ((rt = XFS_IS_REALTIME_INODE(ip))) {
+               int     sbrtextsize, iprtextsize;
+
+               sbrtextsize = mp->m_sb.sb_rextsize;
+               iprtextsize =
+                       ip->i_d.di_extsize ? ip->i_d.di_extsize : sbrtextsize;
+               /* NOTE(review): rounds up with iprtextsize but divides by
+                * sbrtextsize -- looks inconsistent when di_extsize !=
+                * sb_rextsize; confirm intended rt extent accounting */
+               numrtextents = (count_fsb + iprtextsize - 1);
+               do_div(numrtextents, sbrtextsize);
+               datablocks = 0;
+       } else {
+               datablocks = count_fsb;
+               numrtextents = 0;
+       }
+
+       /*
+        * allocate and setup the transaction
+        */
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+
+       resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks);
+
+       error = xfs_trans_reserve(tp, resblks,
+                       XFS_WRITE_LOG_RES(mp), numrtextents,
+                       XFS_TRANS_PERM_LOG_RES,
+                       XFS_WRITE_LOG_COUNT);
+
+       /*
+        * check for running out of space
+        */
+       if (error)
+               /*
+                * Free the transaction structure.
+                */
+               xfs_trans_cancel(tp, 0);
+
+       /* retake the ilock whether or not the reservation succeeded --
+        * the caller expects it held on return */
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+       if (error)
+               goto error_out; /* Don't return in above if .. trans ..,
+                                       need lock to return */
+
+       if (XFS_TRANS_RESERVE_BLKQUOTA(mp, tp, ip, resblks)) {
+               error = (EDQUOT);
+               goto error1;
+       }
+       nimaps = 1;
+
+       bmapi_flag = XFS_BMAPI_WRITE;
+       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+       xfs_trans_ihold(tp, ip);
+
+       if (!(flags & BMAPI_MMAP) && (offset < ip->i_d.di_size || rt))
+               bmapi_flag |= XFS_BMAPI_PREALLOC;
+
+       /*
+        * issue the bmapi() call to allocate the blocks
+        */
+       XFS_BMAP_INIT(&free_list, &firstfsb);
+       imapp = &imap[0];
+       error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
+               bmapi_flag, &firstfsb, 0, imapp, &nimaps, &free_list);
+       if (error) {
+               goto error0;
+       }
+
+       /*
+        * complete the transaction
+        */
+
+       error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
+       if (error) {
+               goto error0;
+       }
+
+       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+       if (error) {
+               goto error_out;
+       }
+
+       /* copy any maps to caller's array and return any error. */
+       if (nimaps == 0) {
+               error = (ENOSPC);
+               goto error_out;
+       }
+
+       *ret_imap = imap[0];
+       *nmaps = 1;
+       return 0;
+
+ error0:       /* Cancel bmap, unlock inode, and cancel trans */
+       xfs_bmap_cancel(&free_list);
+
+ error1:       /* Just cancel transaction */
+       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+       *nmaps = 0;     /* nothing set-up here */
+
+error_out:
+       return XFS_ERROR(error);
+}
+
+/*
+ * Reserve delayed-allocation blocks for a buffered write at
+ * @offset/@count.  Extends the reservation out to the write-iosize
+ * (and optionally the stripe unit) when writing past EOF, and retries
+ * via xfs_flush_space() when the reservation hits ENOSPC.  On success
+ * *ret_imap/*nmaps describe the first (possibly delalloc) extent.
+ *
+ * Must be called with the ilock held exclusively (asserted below);
+ * xfs_flush_space() may cycle the lock during retries.
+ */
+int
+xfs_iomap_write_delay(
+       xfs_inode_t     *ip,
+       loff_t          offset,
+       size_t          count,
+       int             ioflag,
+       xfs_bmbt_irec_t *ret_imap,
+       int             *nmaps)
+{
+       xfs_mount_t     *mp = ip->i_mount;
+       xfs_iocore_t    *io = &ip->i_iocore;
+       xfs_fileoff_t   offset_fsb;
+       xfs_fileoff_t   last_fsb;
+       xfs_fsize_t     isize;
+       xfs_fsblock_t   firstblock;
+       int             nimaps;
+       int             error;
+       xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
+       int             aeof;
+       int             fsynced = 0;
+
+       ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0);
+
+       /*
+        * Make sure that the dquots are there. This doesn't hold
+        * the ilock across a disk read.
+        */
+
+       error = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED);
+       if (error)
+               return XFS_ERROR(error);
+
+retry:
+       /* re-sample sizes each retry; the flush may have changed them */
+       isize = ip->i_d.di_size;
+       if (io->io_new_size > isize) {
+               isize = io->io_new_size;
+       }
+
+       aeof = 0;
+       offset_fsb = XFS_B_TO_FSBT(mp, offset);
+       last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
+       /*
+        * If the caller is doing a write at the end of the file,
+        * then extend the allocation (and the buffer used for the write)
+        * out to the file system's write iosize.  We clean up any extra
+        * space left over when the file is closed in xfs_inactive().
+        *
+        * We don't bother with this for sync writes, because we need
+        * to minimize the amount we write for good performance.
+        */
+       if (!(ioflag & BMAPI_SYNC) && ((offset + count) > ip->i_d.di_size)) {
+               xfs_off_t       aligned_offset;
+               unsigned int    iosize;
+               xfs_fileoff_t   ioalign;
+
+               iosize = mp->m_writeio_blocks;
+               aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
+               ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
+               last_fsb = ioalign + iosize;
+               aeof = 1;
+       }
+
+       nimaps = XFS_WRITE_IMAPS;
+       firstblock = NULLFSBLOCK;
+
+       /*
+        * roundup the allocation request to m_dalign boundary if file size
+        * is greater that 512K and we are allocating past the allocation eof
+        */
+       if (mp->m_dalign && (isize >= mp->m_dalign) && aeof) {
+               int eof;
+               xfs_fileoff_t new_last_fsb;
+               new_last_fsb = roundup_64(last_fsb, mp->m_dalign);
+               /* only round up if nothing is already allocated out there */
+               error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
+               if (error) {
+                       return error;
+               }
+               if (eof) {
+                       last_fsb = new_last_fsb;
+               }
+       }
+
+       error = xfs_bmapi(NULL, ip, offset_fsb,
+                         (xfs_filblks_t)(last_fsb - offset_fsb),
+                         XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
+                         XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
+                         &nimaps, NULL);
+       /*
+        * This can be EDQUOT, if nimaps == 0
+        */
+       if (error && (error != ENOSPC)) {
+               return XFS_ERROR(error);
+       }
+       /*
+        * If bmapi returned us nothing, and if we didn't get back EDQUOT,
+        * then we must have run out of space.
+        */
+
+       if (nimaps == 0) {
+               /* try to free space, escalating each time through */
+               if (xfs_flush_space(ip, &fsynced, &ioflag))
+                       return XFS_ERROR(ENOSPC);
+
+               error = 0;
+               goto retry;
+       }
+
+       *ret_imap = imap[0];
+       *nmaps = 1;
+       return 0;
+}
+
+/*
+ * Pass in a delayed allocate extent, convert it to real extents;
+ * return to the caller the extent we create which maps on top of
+ * the originating callers request.
+ *
+ * Called without a lock on the inode.
+ */
+int
+xfs_iomap_write_allocate(
+       xfs_inode_t     *ip,
+       xfs_bmbt_irec_t *map,           /* in: delalloc extent; out: real extent covering the request */
+       int             *retmap)        /* out: 1 if a covering extent was produced */
+{
+       xfs_mount_t     *mp = ip->i_mount;
+       xfs_fileoff_t   offset_fsb, last_block;
+       xfs_fileoff_t   end_fsb, map_start_fsb;
+       xfs_fsblock_t   first_block;
+       xfs_bmap_free_t free_list;
+       xfs_filblks_t   count_fsb;
+       xfs_bmbt_irec_t imap[XFS_STRAT_WRITE_IMAPS];
+       xfs_trans_t     *tp;
+       int             i, nimaps, committed;
+       int             error = 0;
+       int             nres;
+
+       *retmap = 0;
+
+       /*
+        * Make sure that the dquots are there.
+        */
+
+       if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
+               return XFS_ERROR(error);
+
+       offset_fsb = map->br_startoff;
+       count_fsb = map->br_blockcount;
+       map_start_fsb = offset_fsb;
+
+       XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
+
+       while (count_fsb != 0) {
+               /*
+                * Set up a transaction with which to allocate the
+                * backing store for the file.  Do allocations in a
+                * loop until we get some space in the range we are
+                * interested in.  The other space that might be allocated
+                * is in the delayed allocation extent on which we sit
+                * but before our buffer starts.
+                */
+
+               nimaps = 0;
+               while (nimaps == 0) {
+                       tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
+                       nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+                       error = xfs_trans_reserve(tp, nres,
+                                       XFS_WRITE_LOG_RES(mp),
+                                       0, XFS_TRANS_PERM_LOG_RES,
+                                       XFS_WRITE_LOG_COUNT);
+
+                       /* blocks were reserved at delalloc time, so a
+                        * zero-block reservation is a valid fallback */
+                       if (error == ENOSPC) {
+                               error = xfs_trans_reserve(tp, 0,
+                                               XFS_WRITE_LOG_RES(mp),
+                                               0,
+                                               XFS_TRANS_PERM_LOG_RES,
+                                               XFS_WRITE_LOG_COUNT);
+                       }
+                       if (error) {
+                               xfs_trans_cancel(tp, 0);
+                               return XFS_ERROR(error);
+                       }
+                       xfs_ilock(ip, XFS_ILOCK_EXCL);
+                       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+                       xfs_trans_ihold(tp, ip);
+
+                       XFS_BMAP_INIT(&free_list, &first_block);
+
+                       nimaps = XFS_STRAT_WRITE_IMAPS;
+                       /*
+                        * Ensure we don't go beyond eof - it is possible
+                        * the extents changed since we did the read call,
+                        * we dropped the ilock in the interim.
+                        */
+
+                       end_fsb = XFS_B_TO_FSB(mp, ip->i_d.di_size);
+                       xfs_bmap_last_offset(NULL, ip, &last_block,
+                               XFS_DATA_FORK);
+                       last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
+                       if ((map_start_fsb + count_fsb) > last_block) {
+                               count_fsb = last_block - map_start_fsb;
+                               if (count_fsb == 0) {
+                                       /* extent vanished under us */
+                                       error = EAGAIN;
+                                       goto trans_cancel;
+                               }
+                       }
+
+                       /* Go get the actual blocks */
+                       error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb,
+                                       XFS_BMAPI_WRITE, &first_block, 1,
+                                       imap, &nimaps, &free_list);
+
+                       if (error)
+                               goto trans_cancel;
+
+                       error = xfs_bmap_finish(&tp, &free_list,
+                                       first_block, &committed);
+
+                       if (error)
+                               goto trans_cancel;
+
+                       error = xfs_trans_commit(tp,
+                                       XFS_TRANS_RELEASE_LOG_RES, NULL);
+
+                       if (error)
+                               goto error0;
+
+                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               }
+
+               /*
+                * See if we were able to allocate an extent that
+                * covers at least part of the callers request
+                */
+
+               for (i = 0; i < nimaps; i++) {
+                       if ((map->br_startoff >= imap[i].br_startoff) &&
+                           (map->br_startoff < (imap[i].br_startoff +
+                                                imap[i].br_blockcount))) {
+                               *map = imap[i];
+                               *retmap = 1;
+                               XFS_STATS_INC(xs_xstrat_quick);
+                               return 0;
+                       }
+                       count_fsb -= imap[i].br_blockcount;
+               }
+
+               /* So far we have not mapped the requested part of the
+                * file, just surrounding data, try again.
+                */
+               nimaps--;
+               offset_fsb = imap[nimaps].br_startoff +
+                            imap[nimaps].br_blockcount;
+               map_start_fsb = offset_fsb;
+       }
+
+trans_cancel:
+       xfs_bmap_cancel(&free_list);
+       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+error0:
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       return XFS_ERROR(error);
+}
+
+/*
+ * Convert the unwritten extents backing @offset/@count to regular
+ * written extents after the data has been safely on disk.  Works in a
+ * loop, one transaction per bmapi call, until the whole byte range has
+ * been converted.  Takes and drops the ilock per iteration; returns 0
+ * or a positive XFS error code.
+ */
+int
+xfs_iomap_write_unwritten(
+       xfs_inode_t     *ip,
+       loff_t          offset,
+       size_t          count)
+{
+       xfs_mount_t     *mp = ip->i_mount;
+       xfs_trans_t     *tp;
+       xfs_fileoff_t   offset_fsb;
+       xfs_filblks_t   count_fsb;
+       xfs_filblks_t   numblks_fsb;
+       xfs_bmbt_irec_t imap;
+       int             committed;
+       int             error;
+       int             nres;
+       int             nimaps;
+       xfs_fsblock_t   firstfsb;
+       xfs_bmap_free_t free_list;
+
+       offset_fsb = XFS_B_TO_FSBT(mp, offset);
+       count_fsb = XFS_B_TO_FSB(mp, count);
+
+       do {
+               nres = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+
+               /*
+                * set up a transaction to convert the range of extents
+                * from unwritten to real. Do allocations in a loop until
+                * we have covered the range passed in.
+                */
+
+               tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
+               error = xfs_trans_reserve(tp, nres,
+                               XFS_WRITE_LOG_RES(mp), 0,
+                               XFS_TRANS_PERM_LOG_RES,
+                               XFS_WRITE_LOG_COUNT);
+               if (error) {
+                       xfs_trans_cancel(tp, 0);
+                       goto error0;
+               }
+
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+               xfs_trans_ihold(tp, ip);
+
+               /*
+                * Modify the unwritten extent state of the buffer.
+                */
+               XFS_BMAP_INIT(&free_list, &firstfsb);
+               nimaps = 1;
+               error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
+                                 XFS_BMAPI_WRITE, &firstfsb,
+                                 1, &imap, &nimaps, &free_list);
+               if (error)
+                       goto error_on_bmapi_transaction;
+
+               error = xfs_bmap_finish(&(tp), &(free_list),
+                               firstfsb, &committed);
+               if (error)
+                       goto error_on_bmapi_transaction;
+
+               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               if (error)
+                       goto error0;
+
+               if ((numblks_fsb = imap.br_blockcount) == 0) {
+                       /*
+                        * The numblks_fsb value should always get
+                        * smaller, otherwise the loop is stuck.
+                        */
+                       ASSERT(imap.br_blockcount);
+                       break;
+               }
+               /* advance past the blocks converted this iteration */
+               offset_fsb += numblks_fsb;
+               count_fsb -= numblks_fsb;
+       } while (count_fsb > 0);
+
+       return 0;
+
+error_on_bmapi_transaction:
+       xfs_bmap_cancel(&free_list);
+       xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+error0:
+       return XFS_ERROR(error);
+}