# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
#
-EXTRA_CFLAGS += -Ifs/xfs -funsigned-char
+EXTRA_CFLAGS += -Ifs/xfs -Ifs/xfs/linux -funsigned-char
ifeq ($(CONFIG_XFS_DEBUG),y)
EXTRA_CFLAGS += -g -DSTATIC="" -DDEBUG -DXFSDEBUG
xfs_alloc_btree.o \
xfs_attr.o \
xfs_attr_leaf.o \
+ xfs_behavior.o \
xfs_bit.o \
xfs_bmap.o \
xfs_bmap_btree.o \
xfs_inode.o \
xfs_inode_item.o \
xfs_iocore.o \
+ xfs_iomap.o \
xfs_itable.o \
xfs_dfrag.o \
xfs_log.o \
xfs-$(CONFIG_XFS_TRACE) += xfs_dir2_trace.o
-# Objects in pagebuf/
-xfs-y += pagebuf/page_buf.o
-
# Objects in linux/
xfs-y += $(addprefix linux/, \
+ mrlock.o \
xfs_aops.o \
- xfs_behavior.o \
+ xfs_buf.o \
xfs_file.o \
xfs_fs_subr.o \
xfs_globals.o \
xfs_ioctl.o \
- xfs_iomap.o \
xfs_iops.o \
xfs_lrw.o \
xfs_super.o \
xfs-y += $(addprefix support/, \
debug.o \
move.o \
- mrlock.o \
qsort.o \
uuid.o)
--- /dev/null
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_KMEM_H__
+#define __XFS_SUPPORT_KMEM_H__
+
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+/*
+ * Cutoff point to use vmalloc instead of kmalloc.
+ */
+#define MAX_SLAB_SIZE 0x10000
+
+/*
+ * XFS uses slightly different names for these due to the
+ * IRIX heritage.
+ */
+#define kmem_zone kmem_cache_s
+#define kmem_zone_t kmem_cache_t
+
+#define KM_SLEEP 0x0001
+#define KM_NOSLEEP 0x0002
+#define KM_NOFS 0x0004
+
+typedef unsigned long xfs_pflags_t;
+
+#define PFLAGS_TEST_FSTRANS() (current->flags & PF_FSTRANS)
+
+#define PFLAGS_SET_FSTRANS(STATEP) do { \
+ *(STATEP) = current->flags; \
+ current->flags |= PF_FSTRANS; \
+} while (0)
+
+#define PFLAGS_RESTORE(STATEP) do { \
+ current->flags = *(STATEP); \
+} while (0)
+
+#define PFLAGS_DUP(OSTATEP, NSTATEP) do { \
+ *(NSTATEP) = *(OSTATEP); \
+} while (0)
+
+/*
+ * XXX get rid of the unconditional __GFP_NOFAIL by adding
+ * a KM_FAIL flag and using it where we're allowed to fail.
+ */
+static __inline unsigned int
+kmem_flags_convert(int flags)
+{
+ int lflags;
+
+#if DEBUG
+ if (unlikely(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS))) {
+ printk(KERN_WARNING
+ "XFS: memory allocation with wrong flags (%x)\n", flags);
+ BUG();
+ }
+#endif
+
+ lflags = (flags & KM_NOSLEEP) ? GFP_ATOMIC : (GFP_KERNEL|__GFP_NOFAIL);
+
+ /* avoid recursive callbacks to the filesystem during transactions */
+ if (PFLAGS_TEST_FSTRANS())
+ lflags &= ~__GFP_FS;
+
+ return lflags;
+}
+
+static __inline void *
+kmem_alloc(size_t size, int flags)
+{
+ if (unlikely(MAX_SLAB_SIZE < size))
+ /* Use the converted gfp mask so this allocation avoids filesystem-sensitive reclaim */
+ return __vmalloc(size, kmem_flags_convert(flags), PAGE_KERNEL);
+ return kmalloc(size, kmem_flags_convert(flags));
+}
+
+static __inline void *
+kmem_zalloc(size_t size, int flags)
+{
+ void *ptr = kmem_alloc(size, flags);
+ if (likely(ptr != NULL))
+ memset(ptr, 0, size);
+ return ptr;
+}
+
+static __inline void
+kmem_free(void *ptr, size_t size)
+{
+ if (unlikely((unsigned long)ptr < VMALLOC_START ||
+ (unsigned long)ptr >= VMALLOC_END))
+ kfree(ptr);
+ else
+ vfree(ptr);
+}
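+
+/*
+ * Illustrative sketch: callers pair kmem_alloc() with kmem_free() and
+ * carry the size along, IRIX style. Note that kmem_free() decides
+ * between kfree() and vfree() from the pointer alone; the size argument
+ * is kept only for API compatibility:
+ *
+ *     buf = kmem_alloc(len, KM_SLEEP);
+ *     ...
+ *     kmem_free(buf, len);
+ */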
+
+static __inline void *
+kmem_realloc(void *ptr, size_t newsize, size_t oldsize, int flags)
+{
+ void *new = kmem_alloc(newsize, flags);
+
+ if (likely(ptr != NULL)) {
+ if (likely(new != NULL))
+ memcpy(new, ptr, min(oldsize, newsize));
+ kmem_free(ptr, oldsize);
+ }
+
+ return new;
+}
+
+static __inline kmem_zone_t *
+kmem_zone_init(int size, char *zone_name)
+{
+ return kmem_cache_create(zone_name, size, 0, 0, NULL, NULL);
+}
+
+static __inline void *
+kmem_zone_alloc(kmem_zone_t *zone, int flags)
+{
+ return kmem_cache_alloc(zone, kmem_flags_convert(flags));
+}
+
+static __inline void *
+kmem_zone_zalloc(kmem_zone_t *zone, int flags)
+{
+ void *ptr = kmem_zone_alloc(zone, flags);
+ if (likely(ptr != NULL))
+ memset(ptr, 0, kmem_cache_size(zone));
+ return ptr;
+}
+
+static __inline void
+kmem_zone_free(kmem_zone_t *zone, void *ptr)
+{
+ kmem_cache_free(zone, ptr);
+}
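+
+/*
+ * Illustrative sketch of the zone lifecycle, using a hypothetical
+ * foo_t item type:
+ *
+ *     foo_zone = kmem_zone_init(sizeof(foo_t), "foo");
+ *     fp = kmem_zone_zalloc(foo_zone, KM_SLEEP);
+ *     ...
+ *     kmem_zone_free(foo_zone, fp);
+ */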
+
+typedef struct shrinker *kmem_shaker_t;
+typedef int (*kmem_shake_func_t)(int, unsigned int);
+
+static __inline kmem_shaker_t
+kmem_shake_register(kmem_shake_func_t sfunc)
+{
+ return set_shrinker(DEFAULT_SEEKS, sfunc);
+}
+
+static __inline void
+kmem_shake_deregister(kmem_shaker_t shrinker)
+{
+ remove_shrinker(shrinker);
+}
+
+static __inline int
+kmem_shake_allow(unsigned int gfp_mask)
+{
+ return (gfp_mask & __GFP_WAIT);
+}
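+
+/*
+ * Illustrative sketch of a cache shaker (hypothetical callback name).
+ * A callback should back off when kmem_shake_allow() says the gfp mask
+ * does not permit waiting:
+ *
+ *     STATIC int
+ *     foo_shake(int nr_to_scan, unsigned int gfp_mask)
+ *     {
+ *             if (!kmem_shake_allow(gfp_mask))
+ *                     return 0;
+ *             ... try to free up to nr_to_scan cached items ...
+ *             return number of freeable items remaining;
+ *     }
+ *
+ *     foo_shaker = kmem_shake_register(foo_shake);
+ *     ...
+ *     kmem_shake_deregister(foo_shaker);
+ */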
+
+#endif /* __XFS_SUPPORT_KMEM_H__ */
--- /dev/null
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <linux/time.h>
+#include <linux/sched.h>
+#include <asm/system.h>
+#include <linux/interrupt.h>
+#include <asm/current.h>
+
+#include "mrlock.h"
+
+
+#if USE_RW_WAIT_QUEUE_SPINLOCK
+# define wq_write_lock write_lock
+#else
+# define wq_write_lock spin_lock
+#endif
+
+/*
+ * We don't seem to need lock_type (only one is supported), name, or
+ * sequence. But XFS passes them, so leave the arguments here for now.
+ */
+/* ARGSUSED */
+void
+mrlock_init(mrlock_t *mrp, int lock_type, char *name, long sequence)
+{
+ mrp->mr_count = 0;
+ mrp->mr_reads_waiting = 0;
+ mrp->mr_writes_waiting = 0;
+ init_waitqueue_head(&mrp->mr_readerq);
+ init_waitqueue_head(&mrp->mr_writerq);
+ mrp->mr_lock = SPIN_LOCK_UNLOCKED;
+}
+
+/*
+ * Macros to lock/unlock the mrlock_t.
+ */
+
+#define MRLOCK(m) spin_lock(&(m)->mr_lock)
+#define MRUNLOCK(m) spin_unlock(&(m)->mr_lock)
+
+
+/*
+ * lock_wait should never be called in an interrupt thread.
+ *
+ * mrlocks can sleep (i.e. call schedule) and so they can't ever
+ * be called from an interrupt thread.
+ *
+ * The threads that issue the wake-ups are likewise never interrupt threads.
+ *
+ * But, waitqueue_lock is locked from interrupt threads - and we are
+ * called with interrupts disabled, so it is all OK.
+ */
+
+/* ARGSUSED */
+void
+lock_wait(wait_queue_head_t *q, spinlock_t *lock, int rw)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+
+ spin_lock(&q->lock);
+ if (rw) {
+ __add_wait_queue_tail(q, &wait);
+ } else {
+ __add_wait_queue(q, &wait);
+ }
+
+ spin_unlock(&q->lock);
+ spin_unlock(lock);
+
+ schedule();
+
+ spin_lock(&q->lock);
+ __remove_wait_queue(q, &wait);
+ spin_unlock(&q->lock);
+
+ spin_lock(lock);
+
+ /* return with lock held */
+}
+
+/* ARGSUSED */
+void
+mrfree(mrlock_t *mrp)
+{
+}
+
+/* ARGSUSED */
+void
+mrlock(mrlock_t *mrp, int type, int flags)
+{
+ if (type == MR_ACCESS)
+ mraccess(mrp);
+ else
+ mrupdate(mrp);
+}
+
+/* ARGSUSED */
+void
+mraccessf(mrlock_t *mrp, int flags)
+{
+ MRLOCK(mrp);
+ if (mrp->mr_writes_waiting > 0) {
+ mrp->mr_reads_waiting++;
+ lock_wait(&mrp->mr_readerq, &mrp->mr_lock, 0);
+ mrp->mr_reads_waiting--;
+ }
+ while (mrp->mr_count < 0) {
+ mrp->mr_reads_waiting++;
+ lock_wait(&mrp->mr_readerq, &mrp->mr_lock, 0);
+ mrp->mr_reads_waiting--;
+ }
+ mrp->mr_count++;
+ MRUNLOCK(mrp);
+}
+
+/* ARGSUSED */
+void
+mrupdatef(mrlock_t *mrp, int flags)
+{
+ MRLOCK(mrp);
+ while (mrp->mr_count) {
+ mrp->mr_writes_waiting++;
+ lock_wait(&mrp->mr_writerq, &mrp->mr_lock, 1);
+ mrp->mr_writes_waiting--;
+ }
+
+ mrp->mr_count = -1; /* writer on it */
+ MRUNLOCK(mrp);
+}
+
+int
+mrtryaccess(mrlock_t *mrp)
+{
+ MRLOCK(mrp);
+ /*
+ * If anyone is waiting for update access or the lock is held for update,
+ * fail the request.
+ */
+ if (mrp->mr_writes_waiting > 0 || mrp->mr_count < 0) {
+ MRUNLOCK(mrp);
+ return 0;
+ }
+ mrp->mr_count++;
+ MRUNLOCK(mrp);
+ return 1;
+}
+
+int
+mrtrypromote(mrlock_t *mrp)
+{
+ MRLOCK(mrp);
+
+ if (mrp->mr_count == 1) { /* We are the only thread with the lock */
+ mrp->mr_count = -1; /* writer on it */
+ MRUNLOCK(mrp);
+ return 1;
+ }
+
+ MRUNLOCK(mrp);
+ return 0;
+}
+
+int
+mrtryupdate(mrlock_t *mrp)
+{
+ MRLOCK(mrp);
+
+ if (mrp->mr_count) {
+ MRUNLOCK(mrp);
+ return 0;
+ }
+
+ mrp->mr_count = -1; /* writer on it */
+ MRUNLOCK(mrp);
+ return 1;
+}
+
+static __inline__ void mrwake(mrlock_t *mrp)
+{
+ /*
+ * First, if the count is now 0, we need to wake up anyone waiting.
+ */
+ if (!mrp->mr_count) {
+ if (mrp->mr_writes_waiting) { /* Wake up the first waiting writer */
+ wake_up(&mrp->mr_writerq);
+ } else if (mrp->mr_reads_waiting) { /* Wake up any waiting readers */
+ wake_up(&mrp->mr_readerq);
+ }
+ }
+}
+
+void
+mraccunlock(mrlock_t *mrp)
+{
+ MRLOCK(mrp);
+ mrp->mr_count--;
+ mrwake(mrp);
+ MRUNLOCK(mrp);
+}
+
+void
+mrunlock(mrlock_t *mrp)
+{
+ MRLOCK(mrp);
+ if (mrp->mr_count < 0) {
+ mrp->mr_count = 0;
+ } else {
+ mrp->mr_count--;
+ }
+ mrwake(mrp);
+ MRUNLOCK(mrp);
+}
+
+int
+ismrlocked(mrlock_t *mrp, int type) /* No need to lock since info can change */
+{
+ if (type == MR_ACCESS)
+ return (mrp->mr_count > 0); /* Read lock */
+ else if (type == MR_UPDATE)
+ return (mrp->mr_count < 0); /* Write lock */
+ else if (type == (MR_UPDATE | MR_ACCESS))
+ return (mrp->mr_count); /* Any type of lock held */
+ else /* Any waiters */
+ return (mrp->mr_reads_waiting | mrp->mr_writes_waiting);
+}
+
+/*
+ * Demote from update to access. We had better be the only thread holding
+ * the lock in update mode, so setting the count to 1 is safe.
+ * Wake up any waiting readers.
+ */
+
+void
+mrdemote(mrlock_t *mrp)
+{
+ MRLOCK(mrp);
+ mrp->mr_count = 1;
+ if (mrp->mr_reads_waiting) { /* Wake up all waiting readers */
+ wake_up(&mrp->mr_readerq);
+ }
+ MRUNLOCK(mrp);
+}
--- /dev/null
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_MRLOCK_H__
+#define __XFS_SUPPORT_MRLOCK_H__
+
+#include <linux/time.h>
+#include <linux/wait.h>
+#include <asm/atomic.h>
+#include <asm/semaphore.h>
+
+/*
+ * Implement mrlocks on Linux that work for XFS.
+ *
+ * These are sleep locks and not spinlocks. If one wants read/write spinlocks,
+ * use read_lock, write_lock, ... see spinlock.h.
+ */
+
+typedef struct mrlock_s {
+ int mr_count;
+ unsigned short mr_reads_waiting;
+ unsigned short mr_writes_waiting;
+ wait_queue_head_t mr_readerq;
+ wait_queue_head_t mr_writerq;
+ spinlock_t mr_lock;
+} mrlock_t;
+
+#define MR_ACCESS 1
+#define MR_UPDATE 2
+
+#define MRLOCK_BARRIER 0x1
+#define MRLOCK_ALLOW_EQUAL_PRI 0x8
+
+/*
+ * mraccessf/mrupdatef take flags to be passed in while sleeping;
+ * only PLTWAIT is currently supported.
+ */
+
+extern void mraccessf(mrlock_t *, int);
+extern void mrupdatef(mrlock_t *, int);
+extern void mrlock(mrlock_t *, int, int);
+extern void mrunlock(mrlock_t *);
+extern void mraccunlock(mrlock_t *);
+extern int mrtryupdate(mrlock_t *);
+extern int mrtryaccess(mrlock_t *);
+extern int mrtrypromote(mrlock_t *);
+extern void mrdemote(mrlock_t *);
+
+extern int ismrlocked(mrlock_t *, int);
+extern void mrlock_init(mrlock_t *, int type, char *name, long sequence);
+extern void mrfree(mrlock_t *);
+
+#define mrinit(mrp, name) mrlock_init(mrp, MRLOCK_BARRIER, name, -1)
+#define mraccess(mrp) mraccessf(mrp, 0) /* grab for READ/ACCESS */
+#define mrupdate(mrp) mrupdatef(mrp, 0) /* grab for WRITE/UPDATE */
+#define mrislocked_access(mrp) ((mrp)->mr_count > 0)
+#define mrislocked_update(mrp) ((mrp)->mr_count < 0)
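+
+/*
+ * Illustrative sketch, assuming a hypothetical mrlock_t field ip->i_lock:
+ *
+ *     mrinit(&ip->i_lock, "ilock");
+ *
+ *     mraccess(&ip->i_lock);         shared (read) side
+ *     ...
+ *     mraccunlock(&ip->i_lock);
+ *
+ *     mrupdate(&ip->i_lock);         exclusive (write) side
+ *     ...
+ *     mrunlock(&ip->i_lock);
+ */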
+
+#endif /* __XFS_SUPPORT_MRLOCK_H__ */
--- /dev/null
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_MUTEX_H__
+#define __XFS_SUPPORT_MUTEX_H__
+
+#include <linux/spinlock.h>
+#include <asm/semaphore.h>
+
+/*
+ * Map mutexes from IRIX onto Linux semaphores.
+ *
+ * mutex_destroy() simply re-initializes the semaphore count to -99,
+ * which should block all other callers.
+ */
+#define MUTEX_DEFAULT 0x0
+typedef struct semaphore mutex_t;
+
+#define mutex_init(lock, type, name) sema_init(lock, 1)
+#define mutex_destroy(lock) sema_init(lock, -99)
+#define mutex_lock(lock, num) down(lock)
+#define mutex_trylock(lock) (down_trylock(lock) ? 0 : 1)
+#define mutex_unlock(lock) up(lock)
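+
+/*
+ * Illustrative sketch (hypothetical caller). The type, name and "num"
+ * arguments are accepted for IRIX source compatibility and ignored:
+ *
+ *     mutex_t m;
+ *
+ *     mutex_init(&m, MUTEX_DEFAULT, "m");
+ *     mutex_lock(&m, 0);
+ *     ...
+ *     mutex_unlock(&m);
+ *     mutex_destroy(&m);
+ */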
+
+#endif /* __XFS_SUPPORT_MUTEX_H__ */
--- /dev/null
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_SEMA_H__
+#define __XFS_SUPPORT_SEMA_H__
+
+#include <linux/time.h>
+#include <linux/wait.h>
+#include <asm/atomic.h>
+#include <asm/semaphore.h>
+
+/*
+ * The sema_t type maps directly onto struct semaphore in the Linux kernel.
+ */
+
+typedef struct semaphore sema_t;
+
+#define init_sema(sp, val, c, d) sema_init(sp, val)
+#define initsema(sp, val) sema_init(sp, val)
+#define initnsema(sp, val, name) sema_init(sp, val)
+#define psema(sp, b) down(sp)
+#define vsema(sp) up(sp)
+#define valusema(sp) (atomic_read(&(sp)->count))
+#define freesema(sema)
+
+/*
+ * Map cpsema (try to get the sema) to down_trylock. We need to switch
+ * the return values, since cpsema returns 1 (acquired) / 0 (failed) and
+ * down_trylock returns the reverse: 0 (acquired) / 1 (failed).
+ */
+
+#define cpsema(sp) (down_trylock(sp) ? 0 : 1)
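+
+/*
+ * Illustrative sketch: with the inversion above, a successful trylock
+ * reads naturally at the call site:
+ *
+ *     if (cpsema(&sp)) {
+ *             ... got the semaphore ...
+ *             vsema(&sp);
+ *     }
+ */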
+
+/*
+ * cvsema(sp) is not implemented; it is not clear how to map it onto
+ * up()/down(). On IRIX it does a vsema if the value is < 0, otherwise
+ * it does nothing.
+ */
+
+#endif /* __XFS_SUPPORT_SEMA_H__ */
--- /dev/null
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_SPIN_H__
+#define __XFS_SUPPORT_SPIN_H__
+
+#include <linux/sched.h> /* preempt needs this */
+#include <linux/spinlock.h>
+
+/*
+ * Map lock_t from IRIX to Linux spinlocks.
+ *
+ * Note that Linux compiles spinlock support in or out depending on
+ * CONFIG_SMP, so we don't need to worry about SMP here.
+ */
+
+#define SPLDECL(s) unsigned long s
+
+typedef spinlock_t lock_t;
+
+#define spinlock_init(lock, name) spin_lock_init(lock)
+#define spinlock_destroy(lock)
+
+static inline unsigned long mutex_spinlock(lock_t *lock)
+{
+ spin_lock(lock);
+ return 0;
+}
+
+/*ARGSUSED*/
+static inline void mutex_spinunlock(lock_t *lock, unsigned long s)
+{
+ spin_unlock(lock);
+}
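+
+/*
+ * Illustrative sketch: IRIX callers thread an spl value through, so the
+ * dummy value returned by mutex_spinlock() is declared with SPLDECL:
+ *
+ *     SPLDECL(s);
+ *
+ *     s = mutex_spinlock(&lock);
+ *     ...
+ *     mutex_spinunlock(&lock, s);
+ */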
+
+static inline void nested_spinlock(lock_t *lock)
+{
+ spin_lock(lock);
+}
+
+static inline void nested_spinunlock(lock_t *lock)
+{
+ spin_unlock(lock);
+}
+
+#endif /* __XFS_SUPPORT_SPIN_H__ */
--- /dev/null
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_SV_H__
+#define __XFS_SUPPORT_SV_H__
+
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+
+/*
+ * Synchronisation variables.
+ *
+ * (Parameters "pri", "svf" and "rts" are not implemented)
+ */
+
+typedef struct sv_s {
+ wait_queue_head_t waiters;
+} sv_t;
+
+#define SV_FIFO 0x0 /* sv_t is FIFO type */
+#define SV_LIFO 0x2 /* sv_t is LIFO type */
+#define SV_PRIO 0x4 /* sv_t is PRIO type */
+#define SV_KEYED 0x6 /* sv_t is KEYED type */
+#define SV_DEFAULT SV_FIFO
+
+
+static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state,
+ unsigned long timeout)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ add_wait_queue_exclusive(&sv->waiters, &wait);
+ __set_current_state(state);
+ spin_unlock(lock);
+
+ schedule_timeout(timeout);
+
+ remove_wait_queue(&sv->waiters, &wait);
+}
+
+#define init_sv(sv,type,name,flag) \
+ init_waitqueue_head(&(sv)->waiters)
+#define sv_init(sv,flag,name) \
+ init_waitqueue_head(&(sv)->waiters)
+#define sv_destroy(sv) \
+ /*NOTHING*/
+#define sv_wait(sv, pri, lock, s) \
+ _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
+#define sv_wait_sig(sv, pri, lock, s) \
+ _sv_wait(sv, lock, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
+#define sv_timedwait(sv, pri, lock, s, svf, ts, rts) \
+ _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, timespec_to_jiffies(ts))
+#define sv_timedwait_sig(sv, pri, lock, s, svf, ts, rts) \
+ _sv_wait(sv, lock, TASK_INTERRUPTIBLE, timespec_to_jiffies(ts))
+#define sv_signal(sv) \
+ wake_up(&(sv)->waiters)
+#define sv_broadcast(sv) \
+ wake_up_all(&(sv)->waiters)
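+
+/*
+ * Illustrative sketch: _sv_wait() drops the caller's spinlock before
+ * sleeping and does NOT retake it, so the usual pattern re-locks and
+ * re-checks the condition (hypothetical names):
+ *
+ *     spin_lock(&lock);
+ *     while (!condition) {
+ *             sv_wait(&sv, 0, &lock, 0);
+ *             spin_lock(&lock);
+ *     }
+ *     spin_unlock(&lock);
+ */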
+
+#endif /* __XFS_SUPPORT_SV_H__ */
--- /dev/null
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_TIME_H__
+#define __XFS_SUPPORT_TIME_H__
+
+#include <linux/sched.h>
+#include <linux/time.h>
+
+typedef struct timespec timespec_t;
+
+static inline void delay(long ticks)
+{
+ current->state = TASK_UNINTERRUPTIBLE;
+ schedule_timeout(ticks);
+}
+
+static inline void nanotime(struct timespec *tvp)
+{
+ *tvp = CURRENT_TIME;
+}
+
+#endif /* __XFS_SUPPORT_TIME_H__ */
+++ /dev/null
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like. Any license provided herein, whether implied or
- * otherwise, applies only to this software file. Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA 94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- *
- */
-#include "xfs.h"
-
-/*
- * Source file used to associate/disassociate behaviors with virtualized
- * objects. See xfs_behavior.h for more information about behaviors, etc.
- *
- * The implementation is split between functions in this file and macros
- * in xfs_behavior.h.
- */
-
-/*
- * Insert a new behavior descriptor into a behavior chain.
- *
- * The behavior chain is ordered based on the 'position' number which
- * lives in the first field of the ops vector (higher numbers first).
- *
- * Attempts to insert duplicate ops result in an EINVAL return code.
- * Otherwise, return 0 to indicate success.
- */
-int
-bhv_insert(bhv_head_t *bhp, bhv_desc_t *bdp)
-{
- bhv_desc_t *curdesc, *prev;
- int position;
-
- /*
- * Validate the position value of the new behavior.
- */
- position = BHV_POSITION(bdp);
- ASSERT(position >= BHV_POSITION_BASE && position <= BHV_POSITION_TOP);
-
- /*
- * Find location to insert behavior. Check for duplicates.
- */
- prev = NULL;
- for (curdesc = bhp->bh_first;
- curdesc != NULL;
- curdesc = curdesc->bd_next) {
-
- /* Check for duplication. */
- if (curdesc->bd_ops == bdp->bd_ops) {
- ASSERT(0);
- return EINVAL;
- }
-
- /* Find correct position */
- if (position >= BHV_POSITION(curdesc)) {
- ASSERT(position != BHV_POSITION(curdesc));
- break; /* found it */
- }
-
- prev = curdesc;
- }
-
- if (prev == NULL) {
- /* insert at front of chain */
- bdp->bd_next = bhp->bh_first;
- bhp->bh_first = bdp;
- } else {
- /* insert after prev */
- bdp->bd_next = prev->bd_next;
- prev->bd_next = bdp;
- }
-
- return 0;
-}
-
-/*
- * Remove a behavior descriptor from a position in a behavior chain;
- * the position is guaranteed not to be the first position.
- * Should only be called by the bhv_remove() macro.
- */
-void
-bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp)
-{
- bhv_desc_t *curdesc, *prev;
-
- ASSERT(bhp->bh_first != NULL);
- ASSERT(bhp->bh_first->bd_next != NULL);
-
- prev = bhp->bh_first;
- for (curdesc = bhp->bh_first->bd_next;
- curdesc != NULL;
- curdesc = curdesc->bd_next) {
-
- if (curdesc == bdp)
- break; /* found it */
- prev = curdesc;
- }
-
- ASSERT(curdesc == bdp);
- prev->bd_next = bdp->bd_next; /* remove from after prev */
-}
-
-/*
- * Look for a specific ops vector on the specified behavior chain.
- * Return the associated behavior descriptor. Or NULL, if not found.
- */
-bhv_desc_t *
-bhv_lookup(bhv_head_t *bhp, void *ops)
-{
- bhv_desc_t *curdesc;
-
- for (curdesc = bhp->bh_first;
- curdesc != NULL;
- curdesc = curdesc->bd_next) {
-
- if (curdesc->bd_ops == ops)
- return curdesc;
- }
-
- return NULL;
-}
-
-/*
- * Looks for the first behavior within a specified range of positions.
- * Return the associated behavior descriptor. Or NULL, if none found.
- */
-bhv_desc_t *
-bhv_lookup_range(bhv_head_t *bhp, int low, int high)
-{
- bhv_desc_t *curdesc;
-
- for (curdesc = bhp->bh_first;
- curdesc != NULL;
- curdesc = curdesc->bd_next) {
-
- int position = BHV_POSITION(curdesc);
-
- if (position <= high) {
- if (position >= low)
- return curdesc;
- return NULL;
- }
- }
-
- return NULL;
-}
-
-/*
- * Return the base behavior in the chain, or NULL if the chain
- * is empty.
- *
- * The caller has not read locked the behavior chain, so acquire the
- * lock before traversing the chain.
- */
-bhv_desc_t *
-bhv_base(bhv_head_t *bhp)
-{
- bhv_desc_t *curdesc;
-
- for (curdesc = bhp->bh_first;
- curdesc != NULL;
- curdesc = curdesc->bd_next) {
-
- if (curdesc->bd_next == NULL) {
- return curdesc;
- }
- }
-
- return NULL;
-}
-
-void
-bhv_head_init(
- bhv_head_t *bhp,
- char *name)
-{
- bhp->bh_first = NULL;
-}
-
-void
-bhv_insert_initial(
- bhv_head_t *bhp,
- bhv_desc_t *bdp)
-{
- ASSERT(bhp->bh_first == NULL);
- (bhp)->bh_first = bdp;
-}
-
-void
-bhv_head_destroy(
- bhv_head_t *bhp)
-{
- ASSERT(bhp->bh_first == NULL);
-}
+++ /dev/null
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like. Any license provided herein, whether implied or
- * otherwise, applies only to this software file. Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA 94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-#ifndef __XFS_BEHAVIOR_H__
-#define __XFS_BEHAVIOR_H__
-
-/*
- * Header file used to associate behaviors with virtualized objects.
- *
- * A virtualized object is an internal, virtualized representation of
- * OS entities such as persistent files, processes, or sockets. Examples
- * of virtualized objects include vnodes, vprocs, and vsockets. Often
- * a virtualized object is referred to simply as an "object."
- *
- * A behavior is essentially an implementation layer associated with
- * an object. Multiple behaviors for an object are chained together,
- * the order of chaining determining the order of invocation. Each
- * behavior of a given object implements the same set of interfaces
- * (e.g., the VOP interfaces).
- *
- * Behaviors may be dynamically inserted into an object's behavior chain,
- * such that the addition is transparent to consumers that already have
- * references to the object. Typically, a given behavior will be inserted
- * at a particular location in the behavior chain. Insertion of new
- * behaviors is synchronized with operations-in-progress (oip's) so that
- * the oip's always see a consistent view of the chain.
- *
- * The term "interpostion" is used to refer to the act of inserting
- * a behavior such that it interposes on (i.e., is inserted in front
- * of) a particular other behavior. A key example of this is when a
- * system implementing distributed single system image wishes to
- * interpose a distribution layer (providing distributed coherency)
- * in front of an object that is otherwise only accessed locally.
- *
- * Note that the traditional vnode/inode combination is simply a virtualized
- * object that has exactly one associated behavior.
- *
- * Behavior synchronization is the logic required, under certain
- * circumstances, to ensure there is no conflict between ongoing
- * operations traversing the behavior chain and those dynamically
- * modifying it. Because behavior synchronization adds extra overhead
- * to virtual operation invocation, we want to restrict, as much as
- * we can, the requirement for this extra code, to those situations
- * in which it is truly necessary.
- *
- * Behavior synchronization is needed whenever there's at least one class
- * of object in the system for which:
- * 1) multiple behaviors for a given object are supported,
- * -- AND --
- * 2a) insertion of a new behavior can happen dynamically at any time during
- *     the life of an active object,
- *     -- AND --
- *     3a) insertion of a new behavior needs to synchronize with existing
- *         ops-in-progress.
- *     -- OR --
- *     3b) multiple different behaviors can be dynamically inserted at
- *         any time during the life of an active object
- *     -- OR --
- *     3c) removal of a behavior can occur at any time during the life of
- *         an active object.
- * -- OR --
- * 2b) removal of a behavior can occur at any time during the life of an
- *     active object
- *
- */
-
-struct bhv_head_lock;
-
-/*
- * Behavior head. Head of the chain of behaviors.
- * Contained within each virtualized object data structure.
- */
-typedef struct bhv_head {
- struct bhv_desc *bh_first; /* first behavior in chain */
- struct bhv_head_lock *bh_lockp; /* pointer to lock info struct */
-} bhv_head_t;
-
-/*
- * Behavior descriptor. Descriptor associated with each behavior.
- * Contained within the behavior's private data structure.
- */
-typedef struct bhv_desc {
- void *bd_pdata; /* private data for this behavior */
- void *bd_vobj; /* virtual object associated with */
- void *bd_ops; /* ops for this behavior */
- struct bhv_desc *bd_next; /* next behavior in chain */
-} bhv_desc_t;
-
-/*
- * Behavior identity field. A behavior's identity determines the position
- * where it lives within a behavior chain, and it's always the first field
- * of the behavior's ops vector. The optional id field further identifies the
- * subsystem responsible for the behavior.
- */
-typedef struct bhv_identity {
- __u16 bi_id; /* owning subsystem id */
- __u16 bi_position; /* position in chain */
-} bhv_identity_t;
-
-typedef bhv_identity_t bhv_position_t;
-
-#define BHV_IDENTITY_INIT(id,pos) {id, pos}
-#define BHV_IDENTITY_INIT_POSITION(pos) BHV_IDENTITY_INIT(0, pos)
-
-/*
- * Define boundaries of position values.
- */
-#define BHV_POSITION_INVALID 0 /* invalid position number */
-#define BHV_POSITION_BASE 1 /* base (last) implementation layer */
-#define BHV_POSITION_TOP 63 /* top (first) implementation layer */
-
-/*
- * Plumbing macros.
- */
-#define BHV_HEAD_FIRST(bhp) (ASSERT((bhp)->bh_first), (bhp)->bh_first)
-#define BHV_NEXT(bdp) (ASSERT((bdp)->bd_next), (bdp)->bd_next)
-#define BHV_NEXTNULL(bdp) ((bdp)->bd_next)
-#define BHV_VOBJ(bdp) (ASSERT((bdp)->bd_vobj), (bdp)->bd_vobj)
-#define BHV_VOBJNULL(bdp) ((bdp)->bd_vobj)
-#define BHV_PDATA(bdp) (bdp)->bd_pdata
-#define BHV_OPS(bdp) (bdp)->bd_ops
-#define BHV_IDENTITY(bdp) ((bhv_identity_t *)(bdp)->bd_ops)
-#define BHV_POSITION(bdp) (BHV_IDENTITY(bdp)->bi_position)
-
-extern void bhv_head_init(bhv_head_t *, char *);
-extern void bhv_head_destroy(bhv_head_t *);
-extern int bhv_insert(bhv_head_t *, bhv_desc_t *);
-extern void bhv_insert_initial(bhv_head_t *, bhv_desc_t *);
-
-/*
- * Initialize a new behavior descriptor.
- * Arguments:
- * bdp - pointer to behavior descriptor
- * pdata - pointer to behavior's private data
- * vobj - pointer to associated virtual object
- * ops - pointer to ops for this behavior
- */
-#define bhv_desc_init(bdp, pdata, vobj, ops) \
- { \
- (bdp)->bd_pdata = pdata; \
- (bdp)->bd_vobj = vobj; \
- (bdp)->bd_ops = ops; \
- (bdp)->bd_next = NULL; \
- }
-
-/*
- * Remove a behavior descriptor from a behavior chain.
- */
-#define bhv_remove(bhp, bdp) \
- { \
- if ((bhp)->bh_first == (bdp)) { \
- /* \
- * Remove from front of chain. \
- * Atomic wrt oip's. \
- */ \
- (bhp)->bh_first = (bdp)->bd_next; \
- } else { \
- /* remove from non-front of chain */ \
- bhv_remove_not_first(bhp, bdp); \
- } \
- (bdp)->bd_vobj = NULL; \
- }
-
-/*
- * Behavior module prototypes.
- */
-extern void bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp);
-extern bhv_desc_t * bhv_lookup(bhv_head_t *bhp, void *ops);
-extern bhv_desc_t * bhv_lookup_range(bhv_head_t *bhp, int low, int high);
-extern bhv_desc_t * bhv_base(bhv_head_t *bhp);
-
-/* No bhv locking on Linux */
-#define bhv_lookup_unlocked bhv_lookup
-#define bhv_base_unlocked bhv_base
-
-#endif /* __XFS_BEHAVIOR_H__ */
--- /dev/null
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * page_buf.c
+ *
+ * The page_buf module provides an abstract buffer cache model on top of
+ * the Linux page cache. Cached metadata blocks for a file system are
+ * hashed to the inode for the block device. The page_buf module
+ * assembles buffer (page_buf_t) objects on demand to aggregate such
+ * cached pages for I/O.
+ *
+ *
+ * Written by Steve Lord, Jim Mostek, Russell Cattelan
+ * and Rajagopal Ananthanarayanan ("ananth") at SGI.
+ *
+ */
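+
+/*
+ * Illustrative sketch of a typical metadata read through this layer
+ * (flag combinations vary by caller; blkno/len are in 512-byte basic
+ * blocks):
+ *
+ *     pb = pagebuf_get(target, blkno, len, PBF_LOCK | PBF_READ);
+ *     if (pb != NULL) {
+ *             ... access the data, via pb->pb_addr when mapped ...
+ *             pagebuf_unlock(pb);
+ *             pagebuf_rele(pb);
+ *     }
+ */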
+
+#include <linux/module.h>
+#include <linux/stddef.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/workqueue.h>
+#include <linux/suspend.h>
+#include <linux/percpu.h>
+
+#include <support/ktrace.h>
+#include <support/debug.h>
+#include "kmem.h"
+
+#include "xfs_types.h"
+#include "xfs_cred.h"
+#include "xfs_lrw.h"
+#include "xfs_buf.h"
+
+#define BBSHIFT 9
+#define BN_ALIGN_MASK ((1 << (PAGE_CACHE_SHIFT - BBSHIFT)) - 1)
+
+#ifndef GFP_READAHEAD
+#define GFP_READAHEAD (__GFP_NOWARN|__GFP_NORETRY)
+#endif
+
+/*
+ * File wide globals
+ */
+
+STATIC kmem_cache_t *pagebuf_cache;
+STATIC void pagebuf_daemon_wakeup(int);
+STATIC void pagebuf_delwri_queue(page_buf_t *, int);
+STATIC struct workqueue_struct *pagebuf_logio_workqueue;
+STATIC struct workqueue_struct *pagebuf_dataio_workqueue;
+
+/*
+ * Pagebuf module configuration parameters, exported via
+ * /proc/sys/vm/pagebuf
+ */
+
+typedef struct pb_sysctl_val {
+ int min;
+ int val;
+ int max;
+} pb_sysctl_val_t;
+
+struct {
+ pb_sysctl_val_t flush_interval; /* interval between runs of the
+ * delwri flush daemon. */
+ pb_sysctl_val_t age_buffer; /* time for buffer to age before
+ * we flush it. */
+ pb_sysctl_val_t stats_clear; /* clear the pagebuf stats */
+ pb_sysctl_val_t debug; /* debug tracing on or off */
+} pb_params = {
+ /* MIN DFLT MAX */
+ .flush_interval = { HZ/2, HZ, 30*HZ },
+ .age_buffer = { 1*HZ, 15*HZ, 300*HZ },
+ .stats_clear = { 0, 0, 1 },
+ .debug = { 0, 0, 1 },
+};
+
+enum {
+ PB_FLUSH_INT = 1,
+ PB_FLUSH_AGE = 2,
+ PB_STATS_CLEAR = 3,
+ PB_DEBUG = 4,
+};
+
+/*
+ * Pagebuf statistics variables
+ */
+
+struct pbstats {
+ u_int32_t pb_get;
+ u_int32_t pb_create;
+ u_int32_t pb_get_locked;
+ u_int32_t pb_get_locked_waited;
+ u_int32_t pb_busy_locked;
+ u_int32_t pb_miss_locked;
+ u_int32_t pb_page_retries;
+ u_int32_t pb_page_found;
+ u_int32_t pb_get_read;
+} pbstats;
+DEFINE_PER_CPU(struct pbstats, pbstats);
+
+/* We don't disable preempt; we're not too worried about poking the
+ * wrong cpu's stat for now */
+#define PB_STATS_INC(count) (__get_cpu_var(pbstats).count++)
+
+/*
+ * Pagebuf debugging
+ */
+
+#ifdef PAGEBUF_TRACE
+void
+pagebuf_trace(
+ page_buf_t *pb,
+ char *id,
+ void *data,
+ void *ra)
+{
+ if (!pb_params.debug.val)
+ return;
+ ktrace_enter(pagebuf_trace_buf,
+ pb, id,
+ (void *)(unsigned long)pb->pb_flags,
+ (void *)(unsigned long)pb->pb_hold.counter,
+ (void *)(unsigned long)pb->pb_sema.count.counter,
+ (void *)current,
+ data, ra,
+ (void *)(unsigned long)((pb->pb_file_offset>>32) & 0xffffffff),
+ (void *)(unsigned long)(pb->pb_file_offset & 0xffffffff),
+ (void *)(unsigned long)pb->pb_buffer_length,
+ NULL, NULL, NULL, NULL, NULL);
+}
+ktrace_t *pagebuf_trace_buf;
+EXPORT_SYMBOL(pagebuf_trace_buf);
+#define PAGEBUF_TRACE_SIZE 4096
+#define PB_TRACE(pb, id, data) \
+ pagebuf_trace(pb, id, (void *)data, (void *)__builtin_return_address(0))
+#else
+#define PB_TRACE(pb, id, data) do { } while (0)
+#endif
+
+#ifdef PAGEBUF_LOCK_TRACKING
+# define PB_SET_OWNER(pb) ((pb)->pb_last_holder = current->pid)
+# define PB_CLEAR_OWNER(pb) ((pb)->pb_last_holder = -1)
+# define PB_GET_OWNER(pb) ((pb)->pb_last_holder)
+#else
+# define PB_SET_OWNER(pb) do { } while (0)
+# define PB_CLEAR_OWNER(pb) do { } while (0)
+# define PB_GET_OWNER(pb) do { } while (0)
+#endif
+
+/*
+ * Pagebuf allocation / freeing.
+ */
+
+#define pb_to_gfp(flags) \
+ (((flags) & PBF_READ_AHEAD) ? GFP_READAHEAD : \
+ ((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL)
+
+#define pagebuf_allocate(flags) \
+ kmem_cache_alloc(pagebuf_cache, pb_to_gfp(flags))
+#define pagebuf_deallocate(pb) \
+ kmem_cache_free(pagebuf_cache, (pb))
+
+/*
+ * Pagebuf hashing
+ */
+
+#define NBITS 8
+#define NHASH (1<<NBITS)
+
+typedef struct {
+ struct list_head pb_hash;
+ int pb_count;
+ spinlock_t pb_hash_lock;
+} pb_hash_t;
+
+STATIC pb_hash_t pbhash[NHASH];
+#define pb_hash(pb) (&pbhash[(pb)->pb_hash_index])
+
+STATIC int
+_bhash(
+ struct block_device *bdev,
+ loff_t base)
+{
+ int bit, hval;
+
+ base >>= 9;
+ base ^= (unsigned long)bdev / L1_CACHE_BYTES;
+ for (bit = hval = 0; base && bit < sizeof(base) * 8; bit += NBITS) {
+ hval ^= (int)base & (NHASH-1);
+ base >>= NBITS;
+ }
+ return hval;
+}
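+
+/*
+ * Worked example (illustrative): with NBITS == 8, the bucket index is
+ * folded together one byte at a time, so a base of 0x123456 (after the
+ * shift and bdev xor above) hashes to 0x56 ^ 0x34 ^ 0x12 == 0x70.
+ */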
+
+/*
+ * Mapping of multi-page buffers into contiguous virtual space
+ */
+
+STATIC void *pagebuf_mapout_locked(page_buf_t *);
+
+typedef struct a_list {
+ void *vm_addr;
+ struct a_list *next;
+} a_list_t;
+
+STATIC a_list_t *as_free_head;
+STATIC int as_list_len;
+STATIC spinlock_t as_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * Try to batch vunmaps because they are costly.
+ */
+STATIC void
+free_address(
+ void *addr)
+{
+ a_list_t *aentry;
+
+ aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC);
+ if (aentry) {
+ spin_lock(&as_lock);
+ aentry->next = as_free_head;
+ aentry->vm_addr = addr;
+ as_free_head = aentry;
+ as_list_len++;
+ spin_unlock(&as_lock);
+ } else {
+ vunmap(addr);
+ }
+}
+
+STATIC void
+purge_addresses(void)
+{
+ a_list_t *aentry, *old;
+
+ if (as_free_head == NULL)
+ return;
+
+ spin_lock(&as_lock);
+ aentry = as_free_head;
+ as_free_head = NULL;
+ as_list_len = 0;
+ spin_unlock(&as_lock);
+
+ while ((old = aentry) != NULL) {
+ vunmap(aentry->vm_addr);
+ aentry = aentry->next;
+ kfree(old);
+ }
+}
+
+/*
+ * Internal pagebuf object manipulation
+ */
+
+STATIC void
+_pagebuf_initialize(
+ page_buf_t *pb,
+ pb_target_t *target,
+ loff_t range_base,
+ size_t range_length,
+ page_buf_flags_t flags)
+{
+ /*
+ * We don't want certain flags to appear in pb->pb_flags.
+ */
+ flags &= ~(PBF_LOCK|PBF_MAPPED|PBF_DONT_BLOCK|PBF_READ_AHEAD);
+
+ memset(pb, 0, sizeof(page_buf_t));
+ atomic_set(&pb->pb_hold, 1);
+ init_MUTEX_LOCKED(&pb->pb_iodonesema);
+ INIT_LIST_HEAD(&pb->pb_list);
+ INIT_LIST_HEAD(&pb->pb_hash_list);
+ init_MUTEX_LOCKED(&pb->pb_sema); /* held, no waiters */
+ PB_SET_OWNER(pb);
+ pb->pb_target = target;
+ pb->pb_file_offset = range_base;
+ /*
+ * Set buffer_length and count_desired to the same value initially.
+ * IO routines should use count_desired, which will be the same in
+ * most cases but may be reset (e.g. XFS recovery).
+ */
+ pb->pb_buffer_length = pb->pb_count_desired = range_length;
+ pb->pb_flags = flags | PBF_NONE;
+ pb->pb_bn = PAGE_BUF_DADDR_NULL;
+ atomic_set(&pb->pb_pin_count, 0);
+ init_waitqueue_head(&pb->pb_waiters);
+
+ PB_STATS_INC(pb_create);
+ PB_TRACE(pb, "initialize", target);
+}
+
+/*
+ * Allocate a page array capable of holding a specified number
+ * of pages, and point the page buf at it.
+ */
+STATIC int
+_pagebuf_get_pages(
+ page_buf_t *pb,
+ int page_count,
+ page_buf_flags_t flags)
+{
+ int gfp_mask = pb_to_gfp(flags);
+
+ /* Make sure that we have a page list */
+ if (pb->pb_pages == NULL) {
+ pb->pb_offset = page_buf_poff(pb->pb_file_offset);
+ pb->pb_page_count = page_count;
+ if (page_count <= PB_PAGES) {
+ pb->pb_pages = pb->pb_page_array;
+ } else {
+ pb->pb_pages = kmalloc(sizeof(struct page *) *
+ page_count, gfp_mask);
+ if (pb->pb_pages == NULL)
+ return -ENOMEM;
+ }
+ memset(pb->pb_pages, 0, sizeof(struct page *) * page_count);
+ }
+ return 0;
+}
+
+/*
+ * Walk a pagebuf releasing all the pages contained within it.
+ */
+STATIC inline void
+_pagebuf_freepages(
+ page_buf_t *pb)
+{
+ int buf_index;
+
+ for (buf_index = 0; buf_index < pb->pb_page_count; buf_index++) {
+ struct page *page = pb->pb_pages[buf_index];
+
+ if (page) {
+ pb->pb_pages[buf_index] = NULL;
+ page_cache_release(page);
+ }
+ }
+}
+
+/*
+ * _pagebuf_free_object
+ *
+ * _pagebuf_free_object releases the contents of the specified buffer.
+ * The modification state of any associated pages is left unchanged.
+ */
+void
+_pagebuf_free_object(
+ pb_hash_t *hash, /* hash bucket for buffer */
+ page_buf_t *pb) /* buffer to deallocate */
+{
+ page_buf_flags_t pb_flags = pb->pb_flags;
+
+ PB_TRACE(pb, "free_object", 0);
+ pb->pb_flags |= PBF_FREED;
+
+ if (hash) {
+ if (!list_empty(&pb->pb_hash_list)) {
+ hash->pb_count--;
+ list_del_init(&pb->pb_hash_list);
+ }
+ spin_unlock(&hash->pb_hash_lock);
+ }
+
+ if (!(pb_flags & PBF_FREED)) {
+ /* release any virtual mapping */
+ if (pb->pb_flags & _PBF_ADDR_ALLOCATED) {
+ void *vaddr = pagebuf_mapout_locked(pb);
+ if (vaddr) {
+ free_address(vaddr);
+ }
+ }
+
+ if (pb->pb_flags & _PBF_MEM_ALLOCATED) {
+ if (pb->pb_pages) {
+ /* release the pages in the address list */
+ if ((pb->pb_pages[0]) &&
+ (pb->pb_flags & _PBF_MEM_SLAB)) {
+ kfree(pb->pb_addr);
+ } else {
+ _pagebuf_freepages(pb);
+ }
+ if (pb->pb_pages != pb->pb_page_array)
+ kfree(pb->pb_pages);
+ pb->pb_pages = NULL;
+ }
+ pb->pb_flags &= ~(_PBF_MEM_ALLOCATED|_PBF_MEM_SLAB);
+ }
+ }
+
+ pagebuf_deallocate(pb);
+}
+
+/*
+ * _pagebuf_lookup_pages
+ *
+ * _pagebuf_lookup_pages finds all pages which match the buffer
+ * in question and the range of file offsets supplied,
+ * and builds the page list for the buffer, if the
+ * page list is not already formed or if not all of the pages are
+ * already in the list. Invalid pages (pages which have not yet been
+ * read in from disk) are assigned for any pages which are not found.
+ */
+STATIC int
+_pagebuf_lookup_pages(
+ page_buf_t *pb,
+ struct address_space *aspace,
+ page_buf_flags_t flags)
+{
+ loff_t next_buffer_offset;
+ unsigned long page_count, pi, index;
+ struct page *page;
+ int gfp_mask, retry_count = 5, rval = 0;
+ int all_mapped, good_pages, nbytes;
+ unsigned int blocksize, sectorshift;
+ size_t size, offset;
+
+
+ /* For pagebufs where we want to map an address, do not use
+ * highmem pages - so that we do not need to use kmap resources
+ * to access the data.
+ *
+ * For pages where the caller has indicated there may be resource
+ * contention (e.g. called from a transaction) do not flush
+ * delalloc pages to obtain memory.
+ */
+
+ if (flags & PBF_READ_AHEAD) {
+ gfp_mask = GFP_READAHEAD;
+ retry_count = 0;
+ } else if (flags & PBF_DONT_BLOCK) {
+ gfp_mask = GFP_NOFS;
+ } else if (flags & PBF_MAPPABLE) {
+ gfp_mask = GFP_KERNEL;
+ } else {
+ gfp_mask = GFP_HIGHUSER;
+ }
+
+ next_buffer_offset = pb->pb_file_offset + pb->pb_buffer_length;
+
+ good_pages = page_count = (page_buf_btoc(next_buffer_offset) -
+ page_buf_btoct(pb->pb_file_offset));
+
+ if (pb->pb_flags & _PBF_ALL_PAGES_MAPPED) {
+ /* Bring pages forward in cache */
+ for (pi = 0; pi < page_count; pi++) {
+ mark_page_accessed(pb->pb_pages[pi]);
+ }
+ if ((flags & PBF_MAPPED) && !(pb->pb_flags & PBF_MAPPED)) {
+ all_mapped = 1;
+ goto mapit;
+ }
+ return 0;
+ }
+
+ /* Ensure pb_pages field has been initialised */
+ rval = _pagebuf_get_pages(pb, page_count, flags);
+ if (rval)
+ return rval;
+
+ rval = pi = 0;
+ blocksize = pb->pb_target->pbr_bsize;
+ sectorshift = pb->pb_target->pbr_sshift;
+ size = pb->pb_count_desired;
+ offset = pb->pb_offset;
+
+ /* Enter the pages in the page list */
+ index = (pb->pb_file_offset - pb->pb_offset) >> PAGE_CACHE_SHIFT;
+ for (all_mapped = 1; pi < page_count; pi++, index++) {
+ if (pb->pb_pages[pi] == 0) {
+ retry:
+ page = find_or_create_page(aspace, index, gfp_mask);
+ if (!page) {
+ if (--retry_count > 0) {
+ PB_STATS_INC(pb_page_retries);
+ pagebuf_daemon_wakeup(1);
+ current->state = TASK_UNINTERRUPTIBLE;
+ schedule_timeout(10);
+ goto retry;
+ }
+ rval = -ENOMEM;
+ all_mapped = 0;
+ continue;
+ }
+ PB_STATS_INC(pb_page_found);
+ mark_page_accessed(page);
+ pb->pb_pages[pi] = page;
+ } else {
+ page = pb->pb_pages[pi];
+ lock_page(page);
+ }
+
+ nbytes = PAGE_CACHE_SIZE - offset;
+ if (nbytes > size)
+ nbytes = size;
+ size -= nbytes;
+
+ if (!PageUptodate(page)) {
+ if (blocksize == PAGE_CACHE_SIZE) {
+ if (flags & PBF_READ)
+ pb->pb_locked = 1;
+ good_pages--;
+ } else if (!PagePrivate(page)) {
+ unsigned long i, range;
+
+ /*
+ * In this case page->private holds a bitmap
+ * of uptodate sectors within the page
+ */
+ ASSERT(blocksize < PAGE_CACHE_SIZE);
+ range = (offset + nbytes) >> sectorshift;
+ for (i = offset >> sectorshift; i < range; i++)
+ if (!test_bit(i, &page->private))
+ break;
+ if (i != range)
+ good_pages--;
+ } else {
+ good_pages--;
+ }
+ }
+ offset = 0;
+ }
+
+ if (!pb->pb_locked) {
+ for (pi = 0; pi < page_count; pi++) {
+ if (pb->pb_pages[pi])
+ unlock_page(pb->pb_pages[pi]);
+ }
+ }
+
+mapit:
+ pb->pb_flags |= _PBF_MEM_ALLOCATED;
+ if (all_mapped) {
+ pb->pb_flags |= _PBF_ALL_PAGES_MAPPED;
+
+ /* A single page buffer is always mappable */
+ if (page_count == 1) {
+ pb->pb_addr = (caddr_t)
+ page_address(pb->pb_pages[0]) + pb->pb_offset;
+ pb->pb_flags |= PBF_MAPPED;
+ } else if (flags & PBF_MAPPED) {
+ if (as_list_len > 64)
+ purge_addresses();
+ pb->pb_addr = vmap(pb->pb_pages, page_count,
+ VM_MAP, PAGE_KERNEL);
+ if (pb->pb_addr == NULL)
+ return -ENOMEM;
+ pb->pb_addr += pb->pb_offset;
+ pb->pb_flags |= PBF_MAPPED | _PBF_ADDR_ALLOCATED;
+ }
+ }
+ /* If some pages were found with data in them,
+ * we are not in PBF_NONE state.
+ */
+ if (good_pages != 0) {
+ pb->pb_flags &= ~(PBF_NONE);
+ if (good_pages != page_count) {
+ pb->pb_flags |= PBF_PARTIAL;
+ }
+ }
+
+ PB_TRACE(pb, "lookup_pages", (long)good_pages);
+
+ return rval;
+}
+
+/*
+ * Finding and Reading Buffers
+ */
+
+/*
+ * _pagebuf_find
+ *
+ * Looks up, and creates if absent, a lockable buffer for
+ * a given range of an inode. The buffer is returned
+ * locked. If other overlapping buffers exist, they are
+ * released before the new buffer is created and locked,
+ * which may imply that this call will block until those buffers
+ * are unlocked. No I/O is implied by this call.
+ */
+STATIC page_buf_t *
+_pagebuf_find( /* find buffer for block */
+ pb_target_t *target,/* target for block */
+ loff_t ioff, /* starting offset of range */
+ size_t isize, /* length of range */
+ page_buf_flags_t flags, /* PBF_TRYLOCK */
+ page_buf_t *new_pb)/* newly allocated buffer */
+{
+ loff_t range_base;
+ size_t range_length;
+ int hval;
+ pb_hash_t *h;
+ struct list_head *p;
+ page_buf_t *pb;
+ int not_locked;
+
+ range_base = (ioff << BBSHIFT);
+ range_length = (isize << BBSHIFT);
+
+ /* Ensure we never do IOs smaller than the sector size */
+ BUG_ON(range_length < (1 << target->pbr_sshift));
+
+ /* Ensure we never do IOs that are not sector aligned */
+ BUG_ON(range_base & (loff_t)target->pbr_smask);
+
+ hval = _bhash(target->pbr_bdev, range_base);
+ h = &pbhash[hval];
+
+ spin_lock(&h->pb_hash_lock);
+ list_for_each(p, &h->pb_hash) {
+ pb = list_entry(p, page_buf_t, pb_hash_list);
+
+ if ((target == pb->pb_target) &&
+ (pb->pb_file_offset == range_base) &&
+ (pb->pb_buffer_length == range_length)) {
+ if (pb->pb_flags & PBF_FREED)
+ break;
+ /* If we look at something, bring it to the
+ * front of the list for next time
+ */
+ list_del(&pb->pb_hash_list);
+ list_add(&pb->pb_hash_list, &h->pb_hash);
+ goto found;
+ }
+ }
+
+ /* No match found */
+ if (new_pb) {
+ _pagebuf_initialize(new_pb, target, range_base,
+ range_length, flags | _PBF_LOCKABLE);
+ new_pb->pb_hash_index = hval;
+ h->pb_count++;
+ list_add(&new_pb->pb_hash_list, &h->pb_hash);
+ } else {
+ PB_STATS_INC(pb_miss_locked);
+ }
+
+ spin_unlock(&h->pb_hash_lock);
+ return (new_pb);
+
+found:
+ atomic_inc(&pb->pb_hold);
+ spin_unlock(&h->pb_hash_lock);
+
+ /* Attempt to get the semaphore without sleeping,
+ * if this does not work then we need to drop the
+ * spinlock and do a hard attempt on the semaphore.
+ */
+ not_locked = down_trylock(&pb->pb_sema);
+ if (not_locked) {
+ if (!(flags & PBF_TRYLOCK)) {
+ /* wait for buffer ownership */
+ PB_TRACE(pb, "get_lock", 0);
+ pagebuf_lock(pb);
+ PB_STATS_INC(pb_get_locked_waited);
+ } else {
+ /* We asked for a trylock and failed; no need
+ * to look at file offset and length here - we
+ * know that this pagebuf at least overlaps our
+ * pagebuf and is locked, therefore our buffer
+ * either does not exist or is this buffer.
+ */
+
+ pagebuf_rele(pb);
+ PB_STATS_INC(pb_busy_locked);
+ return (NULL);
+ }
+ } else {
+ /* trylock worked */
+ PB_SET_OWNER(pb);
+ }
+
+ if (pb->pb_flags & PBF_STALE)
+ pb->pb_flags &= PBF_MAPPABLE | \
+ PBF_MAPPED | \
+ _PBF_LOCKABLE | \
+ _PBF_ALL_PAGES_MAPPED | \
+ _PBF_ADDR_ALLOCATED | \
+ _PBF_MEM_ALLOCATED | \
+ _PBF_MEM_SLAB;
+ PB_TRACE(pb, "got_lock", 0);
+ PB_STATS_INC(pb_get_locked);
+ return (pb);
+}
+
+
+/*
+ * pagebuf_find
+ *
+ * pagebuf_find returns a buffer matching the specified range of
+ * data for the specified target, if any of the relevant blocks
+ * are in memory. The buffer may have unallocated holes, if
+ * some, but not all, of the blocks are in memory. Even where
+ * pages are present in the buffer, not all of every page may be
+ * valid.
+ */
+page_buf_t *
+pagebuf_find( /* find buffer for block */
+ /* if the block is in memory */
+ pb_target_t *target,/* target for block */
+ loff_t ioff, /* starting offset of range */
+ size_t isize, /* length of range */
+ page_buf_flags_t flags) /* PBF_TRYLOCK */
+{
+ return _pagebuf_find(target, ioff, isize, flags, NULL);
+}
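+
+/*
+ * Illustrative sketch, not part of the code itself: a caller probing
+ * for an in-core buffer without blocking can combine pagebuf_find with
+ * PBF_TRYLOCK and back off when the buffer is busy. The target/ioff/
+ * isize values are assumed to come from the caller.
+ *
+ *	pb = pagebuf_find(target, ioff, isize, PBF_TRYLOCK);
+ *	if (pb) {
+ *		... inspect or modify the locked buffer ...
+ *		pagebuf_unlock(pb);
+ *		pagebuf_rele(pb);
+ *	}
+ */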
+
+/*
+ * pagebuf_get
+ *
+ * pagebuf_get assembles a buffer covering the specified range.
+ * Some or all of the blocks in the range may be valid. Storage
+ * in memory for all portions of the buffer will be allocated,
+ * although backing storage may not be. If PBF_READ is set in
+ * flags, pagebuf_iostart is also called.
+ */
+page_buf_t *
+pagebuf_get( /* allocate a buffer */
+ pb_target_t *target,/* target for buffer */
+ loff_t ioff, /* starting offset of range */
+ size_t isize, /* length of range */
+ page_buf_flags_t flags) /* PBF_TRYLOCK */
+{
+ page_buf_t *pb, *new_pb;
+ int error;
+
+ new_pb = pagebuf_allocate(flags);
+ if (unlikely(!new_pb))
+ return (NULL);
+
+ pb = _pagebuf_find(target, ioff, isize, flags, new_pb);
+ if (pb != new_pb) {
+ pagebuf_deallocate(new_pb);
+ if (unlikely(!pb))
+ return (NULL);
+ }
+
+ PB_STATS_INC(pb_get);
+
+ /* fill in any missing pages */
+ error = _pagebuf_lookup_pages(pb, pb->pb_target->pbr_mapping, flags);
+ if (unlikely(error)) {
+ pagebuf_free(pb);
+ return (NULL);
+ }
+
+ /*
+	 * Always fill in the block number now; the mapped cases can do
+ * their own overlay of this later.
+ */
+ pb->pb_bn = ioff;
+ pb->pb_count_desired = pb->pb_buffer_length;
+
+ if (flags & PBF_READ) {
+ if (PBF_NOT_DONE(pb)) {
+ PB_TRACE(pb, "get_read", (unsigned long)flags);
+ PB_STATS_INC(pb_get_read);
+ pagebuf_iostart(pb, flags);
+ } else if (flags & PBF_ASYNC) {
+ PB_TRACE(pb, "get_read_async", (unsigned long)flags);
+ /*
+ * Read ahead call which is already satisfied,
+ * drop the buffer
+ */
+ if (flags & (PBF_LOCK | PBF_TRYLOCK))
+ pagebuf_unlock(pb);
+ pagebuf_rele(pb);
+ return NULL;
+ } else {
+ PB_TRACE(pb, "get_read_done", (unsigned long)flags);
+ /* We do not want read in the flags */
+ pb->pb_flags &= ~PBF_READ;
+ }
+ } else {
+ PB_TRACE(pb, "get_write", (unsigned long)flags);
+ }
+ return (pb);
+}
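+
+/*
+ * Illustrative sketch (hypothetical blkno/len values): a synchronous
+ * metadata read through pagebuf_get. With PBF_READ set and PBF_ASYNC
+ * clear, pagebuf_iostart is invoked internally and the call returns
+ * only once the I/O has completed.
+ *
+ *	pb = pagebuf_get(target, blkno, len,
+ *			 PBF_LOCK | PBF_READ | PBF_MAPPED | PBF_MAPPABLE);
+ *	if (!pb || pagebuf_geterror(pb))
+ *		... handle the error, releasing pb if non-NULL ...
+ */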
+
+/*
+ * Create a skeletal pagebuf (no pages associated with it).
+ */
+page_buf_t *
+pagebuf_lookup(
+ struct pb_target *target,
+ loff_t ioff,
+ size_t isize,
+ page_buf_flags_t flags)
+{
+ page_buf_t *pb;
+
+ pb = pagebuf_allocate(flags);
+ if (pb) {
+ _pagebuf_initialize(pb, target, ioff, isize, flags);
+ }
+ return pb;
+}
+
+/*
+ * Do the readahead in a deadlock-safe manner: skip it entirely
+ * when the backing device is already congested.
+ */
+void
+pagebuf_readahead(
+ pb_target_t *target,
+ loff_t ioff,
+ size_t isize,
+ page_buf_flags_t flags)
+{
+ struct backing_dev_info *bdi;
+
+ bdi = target->pbr_mapping->backing_dev_info;
+ if (bdi_read_congested(bdi))
+ return;
+ if (bdi_write_congested(bdi))
+ return;
+
+ flags |= (PBF_TRYLOCK|PBF_READ|PBF_ASYNC|PBF_MAPPABLE|PBF_READ_AHEAD);
+ pagebuf_get(target, ioff, isize, flags);
+}
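+
+/*
+ * Illustrative sketch (hypothetical values): readahead is fire and
+ * forget -- the flags added above make it non-blocking and asynchronous
+ * and no buffer is returned; a later pagebuf_get for the same range
+ * finds the pages already (being) read in.
+ *
+ *	pagebuf_readahead(target, next_blkno, len, PBF_DONT_BLOCK);
+ */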
+
+page_buf_t *
+pagebuf_get_empty(
+ size_t len,
+ pb_target_t *target)
+{
+ page_buf_t *pb;
+
+ pb = pagebuf_allocate(_PBF_LOCKABLE);
+ if (pb)
+ _pagebuf_initialize(pb, target, 0, len, _PBF_LOCKABLE);
+ return pb;
+}
+
+static inline struct page *
+mem_to_page(
+ void *addr)
+{
+ if (((unsigned long)addr < VMALLOC_START) ||
+ ((unsigned long)addr >= VMALLOC_END)) {
+ return virt_to_page(addr);
+ } else {
+ return vmalloc_to_page(addr);
+ }
+}
+
+int
+pagebuf_associate_memory(
+ page_buf_t *pb,
+ void *mem,
+ size_t len)
+{
+ int rval;
+ int i = 0;
+ size_t ptr;
+	size_t		end;
+ off_t offset;
+ int page_count;
+
+ page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
+ offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK);
+ if (offset && (len > PAGE_CACHE_SIZE))
+ page_count++;
+
+ /* Free any previous set of page pointers */
+ if (pb->pb_pages && (pb->pb_pages != pb->pb_page_array)) {
+ kfree(pb->pb_pages);
+ }
+ pb->pb_pages = NULL;
+ pb->pb_addr = mem;
+
+ rval = _pagebuf_get_pages(pb, page_count, 0);
+ if (rval)
+ return rval;
+
+ pb->pb_offset = offset;
+ ptr = (size_t) mem & PAGE_CACHE_MASK;
+ end = PAGE_CACHE_ALIGN((size_t) mem + len);
+ /* set up first page */
+ pb->pb_pages[0] = mem_to_page(mem);
+
+ ptr += PAGE_CACHE_SIZE;
+ pb->pb_page_count = ++i;
+ while (ptr < end) {
+ pb->pb_pages[i] = mem_to_page((void *)ptr);
+ pb->pb_page_count = ++i;
+ ptr += PAGE_CACHE_SIZE;
+ }
+ pb->pb_locked = 0;
+
+ pb->pb_count_desired = pb->pb_buffer_length = len;
+ pb->pb_flags |= PBF_MAPPED;
+
+ return 0;
+}
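+
+/*
+ * Worked example of the page accounting above (illustrative only):
+ * with a 4096 byte PAGE_CACHE_SIZE, mem = ...0e00 and len = 8192, the
+ * aligned length covers two pages, and since offset (0xe00) is non-zero
+ * and len > PAGE_CACHE_SIZE, page_count is bumped to three -- the
+ * region straddles pages ...0000, ...1000 and ...2000.
+ */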
+
+page_buf_t *
+pagebuf_get_no_daddr(
+ size_t len,
+ pb_target_t *target)
+{
+ int rval;
+ void *rmem = NULL;
+ page_buf_flags_t flags = _PBF_LOCKABLE | PBF_FORCEIO;
+ page_buf_t *pb;
+ size_t tlen = 0;
+
+ if (unlikely(len > 0x20000))
+ return NULL;
+
+ pb = pagebuf_allocate(flags);
+ if (!pb)
+ return NULL;
+
+ _pagebuf_initialize(pb, target, 0, len, flags);
+
+ do {
+ if (tlen == 0) {
+ tlen = len; /* first time */
+ } else {
+ kfree(rmem); /* free the mem from the previous try */
+ tlen <<= 1; /* double the size and try again */
+ }
+		if ((rmem = kmalloc(tlen, GFP_KERNEL)) == NULL) {
+ pagebuf_free(pb);
+ return NULL;
+ }
+ } while ((size_t)rmem != ((size_t)rmem & ~target->pbr_smask));
+
+ if ((rval = pagebuf_associate_memory(pb, rmem, len)) != 0) {
+ kfree(rmem);
+ pagebuf_free(pb);
+ return NULL;
+ }
+	/* Mark the memory as ours so that pagebuf_free releases it;
+	 * otherwise pagebuf_free would just ignore it.
+	 */
+ pb->pb_flags |= (_PBF_MEM_ALLOCATED | _PBF_MEM_SLAB);
+ PB_CLEAR_OWNER(pb);
+ up(&pb->pb_sema); /* Return unlocked pagebuf */
+
+ PB_TRACE(pb, "no_daddr", rmem);
+
+ return pb;
+}
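+
+/*
+ * A note on the sizing loop above: kmalloc offers no direct alignment
+ * control, but power-of-two slab allocations tend to be naturally
+ * aligned in practice, so doubling tlen until
+ *
+ *	(size_t)rmem == ((size_t)rmem & ~target->pbr_smask)
+ *
+ * holds terminates quickly. With a 512 byte sector (pbr_smask 0x1ff,
+ * an illustrative value) the first sufficiently large allocation is
+ * normally already aligned.
+ */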
+
+
+/*
+ * pagebuf_hold
+ *
+ * Increment reference count on buffer, to hold the buffer concurrently
+ * with another thread which may release (free) the buffer asynchronously.
+ *
+ * Must hold the buffer already to call this function.
+ */
+void
+pagebuf_hold(
+ page_buf_t *pb)
+{
+ atomic_inc(&pb->pb_hold);
+ PB_TRACE(pb, "hold", 0);
+}
+
+/*
+ * pagebuf_free
+ *
+ * pagebuf_free releases the specified buffer. The modification
+ * state of any associated pages is left unchanged.
+ */
+void
+pagebuf_free(
+ page_buf_t *pb)
+{
+ if (pb->pb_flags & _PBF_LOCKABLE) {
+ pb_hash_t *h = pb_hash(pb);
+
+ spin_lock(&h->pb_hash_lock);
+ _pagebuf_free_object(h, pb);
+ } else {
+ _pagebuf_free_object(NULL, pb);
+ }
+}
+
+/*
+ * pagebuf_rele
+ *
+ * pagebuf_rele releases a hold on the specified buffer. If the
+ * hold count is 1, pagebuf_rele calls pagebuf_free.
+ */
+void
+pagebuf_rele(
+ page_buf_t *pb)
+{
+ pb_hash_t *h;
+
+ PB_TRACE(pb, "rele", pb->pb_relse);
+ if (pb->pb_flags & _PBF_LOCKABLE) {
+ h = pb_hash(pb);
+ spin_lock(&h->pb_hash_lock);
+ } else {
+ h = NULL;
+ }
+
+ if (atomic_dec_and_test(&pb->pb_hold)) {
+ int do_free = 1;
+
+ if (pb->pb_relse) {
+ atomic_inc(&pb->pb_hold);
+ if (h)
+ spin_unlock(&h->pb_hash_lock);
+ (*(pb->pb_relse)) (pb);
+ do_free = 0;
+ }
+ if (pb->pb_flags & PBF_DELWRI) {
+ pb->pb_flags |= PBF_ASYNC;
+ atomic_inc(&pb->pb_hold);
+ if (h && do_free)
+ spin_unlock(&h->pb_hash_lock);
+ pagebuf_delwri_queue(pb, 0);
+ do_free = 0;
+ } else if (pb->pb_flags & PBF_FS_MANAGED) {
+ if (h)
+ spin_unlock(&h->pb_hash_lock);
+ do_free = 0;
+ }
+
+ if (do_free) {
+ _pagebuf_free_object(h, pb);
+ }
+ } else if (h) {
+ spin_unlock(&h->pb_hash_lock);
+ }
+}
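+
+/*
+ * Illustrative pairing of the reference counting above (not part of
+ * the code): code handing a buffer to another context takes an extra
+ * hold first, and each side drops its own reference; queue_for_io()
+ * is a hypothetical consumer.
+ *
+ *	pagebuf_hold(pb);		(reference for the consumer)
+ *	queue_for_io(pb);
+ *	...
+ *	pagebuf_rele(pb);		(drop our own reference)
+ */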
+
+
+/*
+ * Mutual exclusion on buffers. Locking model:
+ *
+ * Buffers associated with inodes for which buffer locking
+ * is not enabled are not protected by semaphores, and are
+ * assumed to be exclusively owned by the caller. There is a
+ * spinlock in the buffer, used by the caller when concurrent
+ * access is possible.
+ */
+
+/*
+ * pagebuf_cond_lock
+ *
+ * pagebuf_cond_lock locks a buffer object, if it is not already
+ * locked. Note that this in no way locks the underlying pages, so
+ * it is only useful for synchronizing concurrent use of page buffer
+ * objects, not for synchronizing independent access to the underlying
+ * pages.
+ */
+int
+pagebuf_cond_lock( /* lock buffer, if not locked */
+			/* (returns -EBUSY if locked) */
+ page_buf_t *pb)
+{
+ int locked;
+
+ ASSERT(pb->pb_flags & _PBF_LOCKABLE);
+ locked = down_trylock(&pb->pb_sema) == 0;
+ if (locked) {
+ PB_SET_OWNER(pb);
+ }
+ PB_TRACE(pb, "cond_lock", (long)locked);
+ return(locked ? 0 : -EBUSY);
+}
+
+/*
+ * pagebuf_lock_value
+ *
+ * Return lock value for a pagebuf
+ */
+int
+pagebuf_lock_value(
+ page_buf_t *pb)
+{
+ ASSERT(pb->pb_flags & _PBF_LOCKABLE);
+ return(atomic_read(&pb->pb_sema.count));
+}
+
+/*
+ * pagebuf_lock
+ *
+ * pagebuf_lock locks a buffer object. Note that this in no way
+ * locks the underlying pages, so it is only useful for synchronizing
+ * concurrent use of page buffer objects, not for synchronizing independent
+ * access to the underlying pages.
+ */
+int
+pagebuf_lock(
+ page_buf_t *pb)
+{
+ ASSERT(pb->pb_flags & _PBF_LOCKABLE);
+
+ PB_TRACE(pb, "lock", 0);
+ if (atomic_read(&pb->pb_io_remaining))
+ blk_run_queues();
+ down(&pb->pb_sema);
+ PB_SET_OWNER(pb);
+ PB_TRACE(pb, "locked", 0);
+ return 0;
+}
+
+/*
+ * pagebuf_unlock
+ *
+ * pagebuf_unlock releases the buffer lock taken by pagebuf_lock or
+ * pagebuf_cond_lock (it does not undo any pinning of the underlying
+ * pages done by pagebuf_pin).
+ */
+void
+pagebuf_unlock( /* unlock buffer */
+ page_buf_t *pb) /* buffer to unlock */
+{
+ ASSERT(pb->pb_flags & _PBF_LOCKABLE);
+ PB_CLEAR_OWNER(pb);
+ up(&pb->pb_sema);
+ PB_TRACE(pb, "unlock", 0);
+}
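+
+/*
+ * Illustrative locking sequences (not part of the code), showing the
+ * blocking and non-blocking variants side by side:
+ *
+ *	pagebuf_lock(pb);		(sleeps until available)
+ *	...
+ *	pagebuf_unlock(pb);
+ *
+ *	if (pagebuf_cond_lock(pb) == 0) {	(else -EBUSY, no sleep)
+ *		...
+ *		pagebuf_unlock(pb);
+ *	}
+ */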
+
+
+/*
+ * Pinning Buffer Storage in Memory
+ */
+
+/*
+ * pagebuf_pin
+ *
+ * pagebuf_pin pins all of the memory represented by a buffer in
+ * place. Multiple calls to pagebuf_pin and pagebuf_unpin, for
+ * the same or different buffers affecting a given page, will
+ * properly count the number of outstanding "pin" requests. The
+ * buffer may be released after the pagebuf_pin and a different
+ * buffer used when calling pagebuf_unpin, if desired.
+ * pagebuf_pin should be used by the file system when it wants to
+ * be assured that no attempt will be made to force the affected
+ * memory to disk. It does not assure that a given logical page
+ * will not be moved to a different physical page.
+ */
+void
+pagebuf_pin(
+ page_buf_t *pb)
+{
+ atomic_inc(&pb->pb_pin_count);
+ PB_TRACE(pb, "pin", (long)pb->pb_pin_count.counter);
+}
+
+/*
+ * pagebuf_unpin
+ *
+ * pagebuf_unpin reverses the pinning of memory performed by
+ * pagebuf_pin. Note that both functions affect the logical
+ * pages associated with the buffer, not the buffer itself.
+ */
+void
+pagebuf_unpin(
+ page_buf_t *pb)
+{
+ if (atomic_dec_and_test(&pb->pb_pin_count)) {
+ wake_up_all(&pb->pb_waiters);
+ }
+ PB_TRACE(pb, "unpin", (long)pb->pb_pin_count.counter);
+}
+
+int
+pagebuf_ispin(
+ page_buf_t *pb)
+{
+ return atomic_read(&pb->pb_pin_count);
+}
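+
+/*
+ * Illustrative use of pinning (not part of the code): a filesystem
+ * can pin a buffer while its latest contents exist only in the log,
+ * and unpin it once they are safely committed; _pagebuf_wait_unpin
+ * below is what write-out uses to honour outstanding pins.
+ *
+ *	pagebuf_pin(pb);		(changes only in the log)
+ *	...
+ *	pagebuf_unpin(pb);		(wakes any pb_waiters)
+ */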
+
+/*
+ * _pagebuf_wait_unpin
+ *
+ * _pagebuf_wait_unpin waits until all of the memory associated
+ * with the buffer is no longer locked in memory. It returns
+ * immediately if none of the affected pages are locked.
+ */
+static inline void
+_pagebuf_wait_unpin(
+ page_buf_t *pb)
+{
+ DECLARE_WAITQUEUE (wait, current);
+
+ if (atomic_read(&pb->pb_pin_count) == 0)
+ return;
+
+ add_wait_queue(&pb->pb_waiters, &wait);
+ for (;;) {
+ current->state = TASK_UNINTERRUPTIBLE;
+ if (atomic_read(&pb->pb_pin_count) == 0)
+ break;
+ if (atomic_read(&pb->pb_io_remaining))
+ blk_run_queues();
+ schedule();
+ }
+ remove_wait_queue(&pb->pb_waiters, &wait);
+ current->state = TASK_RUNNING;
+}
+
+/*
+ * Buffer Utility Routines
+ */
+
+/*
+ * pagebuf_iodone
+ *
+ * pagebuf_iodone marks a buffer, for which I/O was in progress,
+ * as complete with respect to that I/O. The pb_iodone routine,
+ * if present, will be called as a side-effect.
+ */
+void
+pagebuf_iodone_work(
+ void *v)
+{
+ page_buf_t *pb = (page_buf_t *)v;
+
+ if (pb->pb_iodone) {
+ (*(pb->pb_iodone)) (pb);
+ return;
+ }
+
+ if (pb->pb_flags & PBF_ASYNC) {
+ if ((pb->pb_flags & _PBF_LOCKABLE) && !pb->pb_relse)
+ pagebuf_unlock(pb);
+ pagebuf_rele(pb);
+ }
+}
+
+void
+pagebuf_iodone(
+ page_buf_t *pb,
+ int dataio,
+ int schedule)
+{
+ pb->pb_flags &= ~(PBF_READ | PBF_WRITE);
+ if (pb->pb_error == 0) {
+ pb->pb_flags &= ~(PBF_PARTIAL | PBF_NONE);
+ }
+
+ PB_TRACE(pb, "iodone", pb->pb_iodone);
+
+ if ((pb->pb_iodone) || (pb->pb_flags & PBF_ASYNC)) {
+ if (schedule) {
+ INIT_WORK(&pb->pb_iodone_work, pagebuf_iodone_work, pb);
+ queue_work(dataio ? pagebuf_dataio_workqueue :
+ pagebuf_logio_workqueue, &pb->pb_iodone_work);
+ } else {
+ pagebuf_iodone_work(pb);
+ }
+ } else {
+ up(&pb->pb_iodonesema);
+ }
+}
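+
+/*
+ * Illustrative completion hook (not part of the code): a caller can
+ * have pagebuf_iodone invoke a private callback instead of the default
+ * unlock/release behaviour by setting pb_iodone before starting I/O;
+ * my_write_done() is a hypothetical function.
+ *
+ *	pb->pb_iodone = my_write_done;
+ *	pagebuf_iostart(pb, PBF_WRITE | PBF_ASYNC);
+ */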
+
+/*
+ * pagebuf_ioerror
+ *
+ * pagebuf_ioerror sets the error code for a buffer.
+ */
+void
+pagebuf_ioerror( /* mark/clear buffer error flag */
+ page_buf_t *pb, /* buffer to mark */
+ unsigned int error) /* error to store (0 if none) */
+{
+ pb->pb_error = error;
+ PB_TRACE(pb, "ioerror", (unsigned long)error);
+}
+
+/*
+ * pagebuf_iostart
+ *
+ * pagebuf_iostart initiates I/O on a buffer, based on the flags supplied.
+ * If necessary, it will arrange for any disk space allocation required,
+ * and it will break up the request if the block mappings require it.
+ * The pb_iodone routine in the buffer supplied will only be called
+ * when all of the subsidiary I/O requests, if any, have been completed.
+ * For writes, pagebuf_iostart goes through pagebuf_iostrategy, which
+ * calls the buffer's pb_strat routine if one is set and falls back to
+ * pagebuf_iorequest otherwise; reads call pagebuf_iorequest directly.
+ */
+int
+pagebuf_iostart( /* start I/O on a buffer */
+ page_buf_t *pb, /* buffer to start */
+ page_buf_flags_t flags) /* PBF_LOCK, PBF_ASYNC, PBF_READ, */
+ /* PBF_WRITE, PBF_DELWRI, */
+ /* PBF_SYNC, PBF_DONT_BLOCK */
+{
+ int status = 0;
+
+ PB_TRACE(pb, "iostart", (unsigned long)flags);
+
+ if (flags & PBF_DELWRI) {
+ pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC);
+ pb->pb_flags |= flags &
+ (PBF_DELWRI | PBF_ASYNC | PBF_SYNC);
+ pagebuf_delwri_queue(pb, 1);
+ return status;
+ }
+
+ pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC | \
+ PBF_DELWRI | PBF_READ_AHEAD | PBF_RUN_QUEUES);
+ pb->pb_flags |= flags & (PBF_READ | PBF_WRITE | PBF_ASYNC | \
+ PBF_SYNC | PBF_READ_AHEAD | PBF_RUN_QUEUES);
+
+ BUG_ON(pb->pb_bn == PAGE_BUF_DADDR_NULL);
+
+ /* For writes allow an alternate strategy routine to precede
+ * the actual I/O request (which may not be issued at all in
+ * a shutdown situation, for example).
+ */
+ status = (flags & PBF_WRITE) ?
+ pagebuf_iostrategy(pb) : pagebuf_iorequest(pb);
+
+ /* Wait for I/O if we are not an async request.
+ * Note: async I/O request completion will release the buffer,
+ * and that can already be done by this point. So using the
+ * buffer pointer from here on, after async I/O, is invalid.
+ */
+ if (!status && !(flags & PBF_ASYNC))
+ status = pagebuf_iowait(pb);
+
+ return status;
+}
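+
+/*
+ * Illustrative sketch (not part of the code): a synchronous write
+ * through pagebuf_iostart. Without PBF_ASYNC, the routine waits in
+ * pagebuf_iowait and returns the final error code, with the buffer
+ * still held by the caller.
+ *
+ *	error = pagebuf_iostart(pb, PBF_WRITE | PBF_SYNC);
+ *	if (error)
+ *		... buffer contents may be examined before release ...
+ */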
+
+/*
+ * Helper routine for pagebuf_iorequest
+ */
+
+STATIC __inline__ int
+_pagebuf_iolocked(
+ page_buf_t *pb)
+{
+ ASSERT(pb->pb_flags & (PBF_READ|PBF_WRITE));
+ if (pb->pb_flags & PBF_READ)
+ return pb->pb_locked;
+ return ((pb->pb_flags & _PBF_LOCKABLE) == 0);
+}
+
+STATIC __inline__ void
+_pagebuf_iodone(
+ page_buf_t *pb,
+ int schedule)
+{
+	if (atomic_dec_and_test(&pb->pb_io_remaining)) {
+ pb->pb_locked = 0;
+ pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), schedule);
+ }
+}
+
+STATIC int
+bio_end_io_pagebuf(
+ struct bio *bio,
+ unsigned int bytes_done,
+ int error)
+{
+ page_buf_t *pb = (page_buf_t *)bio->bi_private;
+ unsigned int i, blocksize = pb->pb_target->pbr_bsize;
+ unsigned int sectorshift = pb->pb_target->pbr_sshift;
+ struct bio_vec *bvec = bio->bi_io_vec;
+
+ if (bio->bi_size)
+ return 1;
+
+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ pb->pb_error = EIO;
+
+ for (i = 0; i < bio->bi_vcnt; i++, bvec++) {
+ struct page *page = bvec->bv_page;
+
+ if (pb->pb_error) {
+ SetPageError(page);
+ } else if (blocksize == PAGE_CACHE_SIZE) {
+ SetPageUptodate(page);
+ } else if (!PagePrivate(page)) {
+ unsigned int j, range;
+
+ ASSERT(blocksize < PAGE_CACHE_SIZE);
+ range = (bvec->bv_offset + bvec->bv_len) >> sectorshift;
+ for (j = bvec->bv_offset >> sectorshift; j < range; j++)
+ set_bit(j, &page->private);
+ if (page->private == (unsigned long)(PAGE_CACHE_SIZE-1))
+ SetPageUptodate(page);
+ }
+
+ if (_pagebuf_iolocked(pb)) {
+ unlock_page(page);
+ }
+ }
+
+ _pagebuf_iodone(pb, 1);
+ bio_put(bio);
+ return 0;
+}
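+
+/*
+ * Worked example of the sub-page tracking above (illustrative only):
+ * with 512 byte sectors (sectorshift 9) and a bvec covering bytes
+ * 1024-2047 of a page, bits 2 and 3 of page->private are set; the
+ * page is marked uptodate only once the accumulated bits match the
+ * full per-page mask tested above.
+ */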
+
+void
+_pagebuf_ioapply(
+ page_buf_t *pb)
+{
+ int i, map_i, total_nr_pages, nr_pages;
+ struct bio *bio;
+ int offset = pb->pb_offset;
+ int size = pb->pb_count_desired;
+ sector_t sector = pb->pb_bn;
+ unsigned int blocksize = pb->pb_target->pbr_bsize;
+ int locking = _pagebuf_iolocked(pb);
+
+ total_nr_pages = pb->pb_page_count;
+ map_i = 0;
+
+	/* Special code path for reading a sub-page-size pagebuf in --
+	 * we populate the whole page, and hence the other metadata
+ * in the same page. This optimization is only valid when the
+ * filesystem block size and the page size are equal.
+ */
+ if ((pb->pb_buffer_length < PAGE_CACHE_SIZE) &&
+ (pb->pb_flags & PBF_READ) && locking &&
+ (blocksize == PAGE_CACHE_SIZE)) {
+ bio = bio_alloc(GFP_NOIO, 1);
+
+ bio->bi_bdev = pb->pb_target->pbr_bdev;
+ bio->bi_sector = sector - (offset >> BBSHIFT);
+ bio->bi_end_io = bio_end_io_pagebuf;
+ bio->bi_private = pb;
+
+ bio_add_page(bio, pb->pb_pages[0], PAGE_CACHE_SIZE, 0);
+ size = 0;
+
+ atomic_inc(&pb->pb_io_remaining);
+
+ goto submit_io;
+ }
+
+ /* Lock down the pages which we need to for the request */
+ if (locking && (pb->pb_flags & PBF_WRITE) && (pb->pb_locked == 0)) {
+ for (i = 0; size; i++) {
+ int nbytes = PAGE_CACHE_SIZE - offset;
+ struct page *page = pb->pb_pages[i];
+
+ if (nbytes > size)
+ nbytes = size;
+
+ lock_page(page);
+
+ size -= nbytes;
+ offset = 0;
+ }
+ offset = pb->pb_offset;
+ size = pb->pb_count_desired;
+ }
+
+next_chunk:
+ atomic_inc(&pb->pb_io_remaining);
+ nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
+ if (nr_pages > total_nr_pages)
+ nr_pages = total_nr_pages;
+
+ bio = bio_alloc(GFP_NOIO, nr_pages);
+ bio->bi_bdev = pb->pb_target->pbr_bdev;
+ bio->bi_sector = sector;
+ bio->bi_end_io = bio_end_io_pagebuf;
+ bio->bi_private = pb;
+
+ for (; size && nr_pages; nr_pages--, map_i++) {
+ int nbytes = PAGE_CACHE_SIZE - offset;
+
+ if (nbytes > size)
+ nbytes = size;
+
+ if (bio_add_page(bio, pb->pb_pages[map_i],
+ nbytes, offset) < nbytes)
+ break;
+
+ offset = 0;
+ sector += nbytes >> BBSHIFT;
+ size -= nbytes;
+ total_nr_pages--;
+ }
+
+submit_io:
+ if (likely(bio->bi_size)) {
+ submit_bio((pb->pb_flags & PBF_READ) ? READ : WRITE, bio);
+ if (size)
+ goto next_chunk;
+ } else {
+ bio_put(bio);
+ pagebuf_ioerror(pb, EIO);
+ }
+
+ if (pb->pb_flags & PBF_RUN_QUEUES) {
+ pb->pb_flags &= ~PBF_RUN_QUEUES;
+ if (atomic_read(&pb->pb_io_remaining) > 1)
+ blk_run_queues();
+ }
+}
+
+/*
+ * pagebuf_iorequest
+ *
+ * pagebuf_iorequest is the core I/O request routine.
+ * It assumes that the buffer is well-formed and
+ * mapped and ready for physical I/O, unlike
+ * pagebuf_iostart(). pagebuf_iostart() goes through
+ * pagebuf_iostrategy, which calls the buffer's pb_strat routine
+ * to start the I/O if one is set, or else calls pagebuf_iorequest()
+ * directly.
+ *
+ * This function will be responsible for ensuring access to the
+ * pages is restricted whilst I/O is in progress - for locking
+ * pagebufs the pagebuf lock is the mediator, for non-locking
+ * pagebufs the pages will be locked. In the locking case we
+ * need to use the pagebuf lock as multiple meta-data buffers
+ * will reference the same page.
+ */
+int
+pagebuf_iorequest( /* start real I/O */
+ page_buf_t *pb) /* buffer to convey to device */
+{
+ PB_TRACE(pb, "iorequest", 0);
+
+ if (pb->pb_flags & PBF_DELWRI) {
+ pagebuf_delwri_queue(pb, 1);
+ return 0;
+ }
+
+ if (pb->pb_flags & PBF_WRITE) {
+ _pagebuf_wait_unpin(pb);
+ }
+
+ pagebuf_hold(pb);
+
+	/* Set the count to 1 initially; this will stop an I/O
+ * completion callout which happens before we have started
+ * all the I/O from calling pagebuf_iodone too early.
+ */
+ atomic_set(&pb->pb_io_remaining, 1);
+ _pagebuf_ioapply(pb);
+ _pagebuf_iodone(pb, 0);
+
+ pagebuf_rele(pb);
+ return 0;
+}
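+
+/*
+ * A note on the completion bias above: pb_io_remaining starts at 1,
+ * each bio submitted by _pagebuf_ioapply adds one, and the trailing
+ * _pagebuf_iodone(pb, 0) drops the initial bias -- so pagebuf_iodone
+ * runs exactly once, after the last bio completes, whichever context
+ * that happens in.
+ */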
+
+/*
+ * pagebuf_iowait
+ *
+ * pagebuf_iowait waits for I/O to complete on the buffer supplied.
+ * It returns immediately if no I/O is pending. In any case, it returns
+ * the error code, if any, or 0 if there is no error.
+ */
+int
+pagebuf_iowait(
+ page_buf_t *pb)
+{
+ PB_TRACE(pb, "iowait", 0);
+ if (atomic_read(&pb->pb_io_remaining))
+ blk_run_queues();
+ down(&pb->pb_iodonesema);
+ PB_TRACE(pb, "iowaited", (long)pb->pb_error);
+ return pb->pb_error;
+}
+
+STATIC void *
+pagebuf_mapout_locked(
+ page_buf_t *pb)
+{
+ void *old_addr = NULL;
+
+ if (pb->pb_flags & PBF_MAPPED) {
+ if (pb->pb_flags & _PBF_ADDR_ALLOCATED)
+ old_addr = pb->pb_addr - pb->pb_offset;
+ pb->pb_addr = NULL;
+ pb->pb_flags &= ~(PBF_MAPPED | _PBF_ADDR_ALLOCATED);
+ }
+
+	return old_addr;	/* Caller must free the address space;
+				 * we are under a spin lock, and it is
+				 * probably not safe to do vfree here.
+				 */
+}
+
+caddr_t
+pagebuf_offset(
+ page_buf_t *pb,
+ size_t offset)
+{
+ struct page *page;
+
+ offset += pb->pb_offset;
+
+ page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT];
+ return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1));
+}
+
+/*
+ * pagebuf_iomove
+ *
+ * Move data into or out of a buffer.
+ */
+void
+pagebuf_iomove(
+ page_buf_t *pb, /* buffer to process */
+ size_t boff, /* starting buffer offset */
+ size_t bsize, /* length to copy */
+ caddr_t data, /* data address */
+ page_buf_rw_t mode) /* read/write flag */
+{
+ size_t bend, cpoff, csize;
+ struct page *page;
+
+ bend = boff + bsize;
+ while (boff < bend) {
+ page = pb->pb_pages[page_buf_btoct(boff + pb->pb_offset)];
+ cpoff = page_buf_poff(boff + pb->pb_offset);
+ csize = min_t(size_t,
+ PAGE_CACHE_SIZE-cpoff, pb->pb_count_desired-boff);
+
+ ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE));
+
+ switch (mode) {
+ case PBRW_ZERO:
+ memset(page_address(page) + cpoff, 0, csize);
+ break;
+ case PBRW_READ:
+ memcpy(data, page_address(page) + cpoff, csize);
+ break;
+ case PBRW_WRITE:
+ memcpy(page_address(page) + cpoff, data, csize);
+ }
+
+ boff += csize;
+ data += csize;
+ }
+}
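+
+/*
+ * Illustrative use (not part of the code): zeroing a sub-range of a
+ * buffer without mapping it, exactly as the xfs_biozero wrapper does.
+ *
+ *	pagebuf_iomove(pb, boff, bsize, NULL, PBRW_ZERO);
+ */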
+
+
+/*
+ * Pagebuf delayed write buffer handling
+ */
+
+STATIC int pbd_active = 1;
+STATIC LIST_HEAD(pbd_delwrite_queue);
+STATIC spinlock_t pbd_delwrite_lock = SPIN_LOCK_UNLOCKED;
+
+STATIC void
+pagebuf_delwri_queue(
+ page_buf_t *pb,
+ int unlock)
+{
+ PB_TRACE(pb, "delwri_q", (long)unlock);
+ spin_lock(&pbd_delwrite_lock);
+ /* If already in the queue, dequeue and place at tail */
+ if (!list_empty(&pb->pb_list)) {
+ if (unlock) {
+ atomic_dec(&pb->pb_hold);
+ }
+ list_del(&pb->pb_list);
+ }
+
+ list_add_tail(&pb->pb_list, &pbd_delwrite_queue);
+ pb->pb_flushtime = jiffies + pb_params.age_buffer.val;
+ spin_unlock(&pbd_delwrite_lock);
+
+ if (unlock && (pb->pb_flags & _PBF_LOCKABLE)) {
+ pagebuf_unlock(pb);
+ }
+}
+
+void
+pagebuf_delwri_dequeue(
+ page_buf_t *pb)
+{
+ PB_TRACE(pb, "delwri_uq", 0);
+ spin_lock(&pbd_delwrite_lock);
+ list_del_init(&pb->pb_list);
+ pb->pb_flags &= ~PBF_DELWRI;
+ spin_unlock(&pbd_delwrite_lock);
+}
+
+STATIC void
+pagebuf_runall_queues(
+ struct workqueue_struct *queue)
+{
+ flush_workqueue(queue);
+}
+
+/* Defines for pagebuf daemon */
+DECLARE_WAIT_QUEUE_HEAD(pbd_waitq);
+STATIC int force_flush;
+
+STATIC void
+pagebuf_daemon_wakeup(
+ int flag)
+{
+ force_flush = flag;
+ if (waitqueue_active(&pbd_waitq)) {
+ wake_up_interruptible(&pbd_waitq);
+ }
+}
+
+typedef void (*timeout_fn)(unsigned long);
+
+STATIC int
+pagebuf_daemon(
+ void *data)
+{
+ int count;
+ page_buf_t *pb;
+ struct list_head *curr, *next, tmp;
+ struct timer_list pb_daemon_timer =
+ TIMER_INITIALIZER((timeout_fn)pagebuf_daemon_wakeup, 0, 0);
+
+ /* Set up the thread */
+ daemonize("pagebufd");
+
+ current->flags |= PF_MEMALLOC;
+
+ INIT_LIST_HEAD(&tmp);
+ do {
+ /* swsusp */
+ if (current->flags & PF_FREEZE)
+ refrigerator(PF_IOTHREAD);
+
+ if (pbd_active == 1) {
+ mod_timer(&pb_daemon_timer,
+ jiffies + pb_params.flush_interval.val);
+ interruptible_sleep_on(&pbd_waitq);
+ }
+
+ if (pbd_active == 0) {
+ del_timer_sync(&pb_daemon_timer);
+ }
+
+ spin_lock(&pbd_delwrite_lock);
+
+ count = 0;
+ list_for_each_safe(curr, next, &pbd_delwrite_queue) {
+ pb = list_entry(curr, page_buf_t, pb_list);
+
+ PB_TRACE(pb, "walkq1", (long)pagebuf_ispin(pb));
+
+ if ((pb->pb_flags & PBF_DELWRI) && !pagebuf_ispin(pb) &&
+ (((pb->pb_flags & _PBF_LOCKABLE) == 0) ||
+ !pagebuf_cond_lock(pb))) {
+
+ if (!force_flush &&
+ time_before(jiffies, pb->pb_flushtime)) {
+ pagebuf_unlock(pb);
+ break;
+ }
+
+ pb->pb_flags &= ~PBF_DELWRI;
+ pb->pb_flags |= PBF_WRITE;
+
+ list_del(&pb->pb_list);
+ list_add(&pb->pb_list, &tmp);
+
+ count++;
+ }
+ }
+
+ spin_unlock(&pbd_delwrite_lock);
+ while (!list_empty(&tmp)) {
+ pb = list_entry(tmp.next, page_buf_t, pb_list);
+ list_del_init(&pb->pb_list);
+
+ pagebuf_iostrategy(pb);
+ }
+
+ if (as_list_len > 0)
+ purge_addresses();
+ if (count)
+ blk_run_queues();
+
+ force_flush = 0;
+ } while (pbd_active == 1);
+
+ pbd_active = -1;
+ wake_up_interruptible(&pbd_waitq);
+
+ return 0;
+}
+
+void
+pagebuf_delwri_flush(
+ pb_target_t *target,
+ u_long flags,
+ int *pinptr)
+{
+ page_buf_t *pb;
+ struct list_head *curr, *next, tmp;
+ int pincount = 0;
+ int flush_cnt = 0;
+
+ pagebuf_runall_queues(pagebuf_dataio_workqueue);
+ pagebuf_runall_queues(pagebuf_logio_workqueue);
+
+ spin_lock(&pbd_delwrite_lock);
+ INIT_LIST_HEAD(&tmp);
+
+ list_for_each_safe(curr, next, &pbd_delwrite_queue) {
+ pb = list_entry(curr, page_buf_t, pb_list);
+
+ /*
+		 * Skip other targets, markers and in-progress buffers
+ */
+
+ if ((pb->pb_flags == 0) || (pb->pb_target != target) ||
+ !(pb->pb_flags & PBF_DELWRI)) {
+ continue;
+ }
+
+ PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb));
+ if (pagebuf_ispin(pb)) {
+ pincount++;
+ continue;
+ }
+
+ pb->pb_flags &= ~PBF_DELWRI;
+ pb->pb_flags |= PBF_WRITE;
+ list_move(&pb->pb_list, &tmp);
+ }
+	/* OK, we have found all the items that can be worked on;
+	 * drop the lock and process the private list. */
+ spin_unlock(&pbd_delwrite_lock);
+
+ list_for_each_safe(curr, next, &tmp) {
+ pb = list_entry(curr, page_buf_t, pb_list);
+
+ if (flags & PBDF_WAIT)
+ pb->pb_flags &= ~PBF_ASYNC;
+ else
+ list_del_init(curr);
+
+ pagebuf_lock(pb);
+ pagebuf_iostrategy(pb);
+ if (++flush_cnt > 32) {
+ blk_run_queues();
+ flush_cnt = 0;
+ }
+ }
+
+ blk_run_queues();
+
+ while (!list_empty(&tmp)) {
+ pb = list_entry(tmp.next, page_buf_t, pb_list);
+
+ list_del_init(&pb->pb_list);
+ pagebuf_iowait(pb);
+ if (!pb->pb_relse)
+ pagebuf_unlock(pb);
+ pagebuf_rele(pb);
+ }
+
+ if (pinptr)
+ *pinptr = pincount;
+}
+
+STATIC int
+pagebuf_daemon_start(void)
+{
+ int rval;
+
+ pagebuf_logio_workqueue = create_workqueue("xfslogd");
+ if (!pagebuf_logio_workqueue)
+ return -ENOMEM;
+
+ pagebuf_dataio_workqueue = create_workqueue("xfsdatad");
+ if (!pagebuf_dataio_workqueue) {
+ destroy_workqueue(pagebuf_logio_workqueue);
+ return -ENOMEM;
+ }
+
+ rval = kernel_thread(pagebuf_daemon, NULL, CLONE_FS|CLONE_FILES);
+ if (rval < 0) {
+ destroy_workqueue(pagebuf_logio_workqueue);
+ destroy_workqueue(pagebuf_dataio_workqueue);
+ }
+
+ return rval;
+}
+
+/*
+ * pagebuf_daemon_stop
+ *
+ * Note: do not mark as __exit, it is called from pagebuf_terminate.
+ */
+STATIC void
+pagebuf_daemon_stop(void)
+{
+ pbd_active = 0;
+ wake_up_interruptible(&pbd_waitq);
+ wait_event_interruptible(pbd_waitq, pbd_active);
+ destroy_workqueue(pagebuf_logio_workqueue);
+ destroy_workqueue(pagebuf_dataio_workqueue);
+}
+
+
+/*
+ * Pagebuf sysctl interface
+ */
+
+STATIC int
+pb_stats_clear_handler(
+ ctl_table *ctl,
+ int write,
+ struct file *filp,
+ void *buffer,
+ size_t *lenp)
+{
+ int c, ret;
+ int *valp = ctl->data;
+
+ ret = proc_dointvec_minmax(ctl, write, filp, buffer, lenp);
+
+ if (!ret && write && *valp) {
+		printk(KERN_INFO "XFS Clearing pbstats\n");
+ for (c = 0; c < NR_CPUS; c++) {
+ if (!cpu_possible(c)) continue;
+ memset(&per_cpu(pbstats, c), 0,
+ sizeof(struct pbstats));
+ }
+ pb_params.stats_clear.val = 0;
+ }
+
+ return ret;
+}
+
+STATIC struct ctl_table_header *pagebuf_table_header;
+
+STATIC ctl_table pagebuf_table[] = {
+ {PB_FLUSH_INT, "flush_int", &pb_params.flush_interval.val,
+ sizeof(int), 0644, NULL, &proc_dointvec_minmax,
+ &sysctl_intvec, NULL,
+ &pb_params.flush_interval.min, &pb_params.flush_interval.max},
+
+ {PB_FLUSH_AGE, "flush_age", &pb_params.age_buffer.val,
+ sizeof(int), 0644, NULL, &proc_dointvec_minmax,
+ &sysctl_intvec, NULL,
+ &pb_params.age_buffer.min, &pb_params.age_buffer.max},
+
+ {PB_STATS_CLEAR, "stats_clear", &pb_params.stats_clear.val,
+ sizeof(int), 0644, NULL, &pb_stats_clear_handler,
+ &sysctl_intvec, NULL,
+ &pb_params.stats_clear.min, &pb_params.stats_clear.max},
+
+#ifdef PAGEBUF_TRACE
+ {PB_DEBUG, "debug", &pb_params.debug.val,
+ sizeof(int), 0644, NULL, &proc_dointvec_minmax,
+ &sysctl_intvec, NULL,
+ &pb_params.debug.min, &pb_params.debug.max},
+#endif
+ {0}
+};
+
+STATIC ctl_table pagebuf_dir_table[] = {
+ {VM_PAGEBUF, "pagebuf", NULL, 0, 0555, pagebuf_table},
+ {0}
+};
+
+STATIC ctl_table pagebuf_root_table[] = {
+ {CTL_VM, "vm", NULL, 0, 0555, pagebuf_dir_table},
+ {0}
+};
+
+#ifdef CONFIG_PROC_FS
+STATIC int
+pagebuf_readstats(
+ char *buffer,
+ char **start,
+ off_t offset,
+ int count,
+ int *eof,
+ void *data)
+{
+ int c, i, len, val;
+
+ len = 0;
+ len += sprintf(buffer + len, "pagebuf");
+ for (i = 0; i < sizeof(struct pbstats) / sizeof(u_int32_t); i++) {
+ val = 0;
+ for (c = 0 ; c < NR_CPUS; c++) {
+ if (!cpu_possible(c)) continue;
+ val += *(((u_int32_t*)&per_cpu(pbstats, c) + i));
+ }
+ len += sprintf(buffer + len, " %u", val);
+ }
+ buffer[len++] = '\n';
+
+ if (offset >= len) {
+ *start = buffer;
+ *eof = 1;
+ return 0;
+ }
+ *start = buffer + offset;
+ if ((len -= offset) > count)
+ return count;
+ *eof = 1;
+
+ return len;
+}
+#endif /* CONFIG_PROC_FS */
+
+/*
+ * Initialization and Termination
+ */
+
+int __init
+pagebuf_init(void)
+{
+ int i;
+
+ pagebuf_table_header = register_sysctl_table(pagebuf_root_table, 1);
+
+#ifdef CONFIG_PROC_FS
+ if (proc_mkdir("fs/pagebuf", 0))
+ create_proc_read_entry(
+ "fs/pagebuf/stat", 0, 0, pagebuf_readstats, NULL);
+#endif
+
+ pagebuf_cache = kmem_cache_create("page_buf_t", sizeof(page_buf_t), 0,
+ SLAB_HWCACHE_ALIGN, NULL, NULL);
+ if (pagebuf_cache == NULL) {
+		printk(KERN_ERR "pagebuf: couldn't init pagebuf cache\n");
+ pagebuf_terminate();
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < NHASH; i++) {
+ spin_lock_init(&pbhash[i].pb_hash_lock);
+ INIT_LIST_HEAD(&pbhash[i].pb_hash);
+ }
+
+#ifdef PAGEBUF_TRACE
+ pagebuf_trace_buf = ktrace_alloc(PAGEBUF_TRACE_SIZE, KM_SLEEP);
+#endif
+
+ pagebuf_daemon_start();
+ return 0;
+}
+
+
+/*
+ * pagebuf_terminate.
+ *
+ * Note: do not mark as __exit, this is also called from the __init code.
+ */
+void
+pagebuf_terminate(void)
+{
+ pagebuf_daemon_stop();
+
+ kmem_cache_destroy(pagebuf_cache);
+
+ unregister_sysctl_table(pagebuf_table_header);
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("fs/pagebuf/stat", NULL);
+ remove_proc_entry("fs/pagebuf", NULL);
+#endif
+}
+
+
+/*
+ * Module management (for kernel debugger module)
+ */
+EXPORT_SYMBOL(pagebuf_offset);
+#ifdef DEBUG
+EXPORT_SYMBOL(pbd_delwrite_queue);
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * Written by Steve Lord, Jim Mostek, Russell Cattelan at SGI
+ */
+
+#ifndef __XFS_BUF_H__
+#define __XFS_BUF_H__
+
+#include <linux/config.h>
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <asm/system.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/uio.h>
+
+/*
+ * Base types
+ */
+
+/* daddr must be signed since -1 is used for bmaps that are not yet allocated */
+typedef loff_t page_buf_daddr_t;
+
+#define PAGE_BUF_DADDR_NULL ((page_buf_daddr_t) (-1LL))
+
+#define page_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE)
+#define page_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT)
+#define page_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT)
+#define page_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK)
+
+typedef enum page_buf_rw_e {
+ PBRW_READ = 1, /* transfer into target memory */
+ PBRW_WRITE = 2, /* transfer from target memory */
+ PBRW_ZERO = 3 /* Zero target memory */
+} page_buf_rw_t;
+
+
+typedef enum page_buf_flags_e { /* pb_flags values */
+ PBF_READ = (1 << 0), /* buffer intended for reading from device */
+ PBF_WRITE = (1 << 1), /* buffer intended for writing to device */
+ PBF_MAPPED = (1 << 2), /* buffer mapped (pb_addr valid) */
+ PBF_PARTIAL = (1 << 3), /* buffer partially read */
+ PBF_ASYNC = (1 << 4), /* initiator will not wait for completion */
+ PBF_NONE = (1 << 5), /* buffer not read at all */
+ PBF_DELWRI = (1 << 6), /* buffer has dirty pages */
+ PBF_FREED = (1 << 7), /* buffer has been freed and is invalid */
+ PBF_SYNC = (1 << 8), /* force updates to disk */
+ PBF_MAPPABLE = (1 << 9),/* use directly-addressable pages */
+ PBF_STALE = (1 << 10), /* buffer has been staled, do not find it */
+ PBF_FS_MANAGED = (1 << 11), /* filesystem controls freeing memory */
+ PBF_FS_DATAIOD = (1 << 12), /* schedule IO completion on fs datad */
+
+ /* flags used only as arguments to access routines */
+ PBF_LOCK = (1 << 13), /* lock requested */
+ PBF_TRYLOCK = (1 << 14), /* lock requested, but do not wait */
+ PBF_DONT_BLOCK = (1 << 15), /* do not block in current thread */
+
+ /* flags used only internally */
+ _PBF_LOCKABLE = (1 << 16), /* page_buf_t may be locked */
+ _PBF_PRIVATE_BH = (1 << 17), /* do not use public buffer heads */
+ _PBF_ALL_PAGES_MAPPED = (1 << 18), /* all pages in range mapped */
+ _PBF_ADDR_ALLOCATED = (1 << 19), /* pb_addr space was allocated */
+ _PBF_MEM_ALLOCATED = (1 << 20), /* underlying pages are allocated */
+ _PBF_MEM_SLAB = (1 << 21), /* underlying pages are slab allocated */
+
+ PBF_FORCEIO = (1 << 22), /* ignore any cache state */
+ PBF_FLUSH = (1 << 23), /* flush disk write cache */
+ PBF_READ_AHEAD = (1 << 24), /* asynchronous read-ahead */
+ PBF_RUN_QUEUES = (1 << 25), /* run block device task queue */
+
+} page_buf_flags_t;
+
+#define PBF_UPDATE (PBF_READ | PBF_WRITE)
+#define PBF_NOT_DONE(pb) (((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) != 0)
+#define PBF_DONE(pb) (((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) == 0)
+
+typedef struct pb_target {
+ dev_t pbr_dev;
+ struct block_device *pbr_bdev;
+ struct address_space *pbr_mapping;
+ unsigned int pbr_bsize;
+ unsigned int pbr_sshift;
+ size_t pbr_smask;
+} pb_target_t;
+
+/*
+ * page_buf_t: Buffer structure for page cache-based buffers
+ *
+ * This buffer structure is used by the page cache buffer management routines
+ * to refer to an assembly of pages forming a logical buffer. The actual
+ * I/O is performed with buffer_head or bio structures, as required by
+ * drivers which do not understand this structure. The buffer structure is
+ * used on a temporary basis only, and discarded when released.
+ *
+ * The real data storage is recorded in the page cache. Metadata is
+ * hashed to the inode for the block device on which the file system resides.
+ * File data is hashed to the inode for the file. Pages which are only
+ * partially filled with data have bits set in their block_map entry
+ * to indicate which disk blocks in the page are not valid.
+ */
+
+struct page_buf_s;
+typedef void (*page_buf_iodone_t)(struct page_buf_s *);
+ /* call-back function on I/O completion */
+typedef void (*page_buf_relse_t)(struct page_buf_s *);
+			/* call-back function on buffer release */
+typedef int (*page_buf_bdstrat_t)(struct page_buf_s *);
+
+#define PB_PAGES 4
+
+typedef struct page_buf_s {
+ struct semaphore pb_sema; /* semaphore for lockables */
+ unsigned long pb_flushtime; /* time to flush pagebuf */
+ atomic_t pb_pin_count; /* pin count */
+ wait_queue_head_t pb_waiters; /* unpin waiters */
+ struct list_head pb_list;
+ page_buf_flags_t pb_flags; /* status flags */
+ struct list_head pb_hash_list;
+ struct pb_target *pb_target; /* logical object */
+ atomic_t pb_hold; /* reference count */
+ page_buf_daddr_t pb_bn; /* block number for I/O */
+ loff_t pb_file_offset; /* offset in file */
+ size_t pb_buffer_length; /* size of buffer in bytes */
+ size_t pb_count_desired; /* desired transfer size */
+ void *pb_addr; /* virtual address of buffer */
+ struct work_struct pb_iodone_work;
+ atomic_t pb_io_remaining;/* #outstanding I/O requests */
+ page_buf_iodone_t pb_iodone; /* I/O completion function */
+ page_buf_relse_t pb_relse; /* releasing function */
+ page_buf_bdstrat_t pb_strat; /* pre-write function */
+ struct semaphore pb_iodonesema; /* Semaphore for I/O waiters */
+ void *pb_fspriv;
+ void *pb_fspriv2;
+ void *pb_fspriv3;
+ unsigned short pb_error; /* error code on I/O */
+ unsigned short pb_page_count; /* size of page array */
+ unsigned short pb_offset; /* page offset in first page */
+ unsigned char pb_locked; /* page array is locked */
+ unsigned char pb_hash_index; /* hash table index */
+ struct page **pb_pages; /* array of page pointers */
+ struct page *pb_page_array[PB_PAGES]; /* inline pages */
+#ifdef PAGEBUF_LOCK_TRACKING
+ int pb_last_holder;
+#endif
+} page_buf_t;
+
+
+/* Finding and Reading Buffers */
+
+extern page_buf_t *pagebuf_find( /* find buffer for block if */
+ /* the block is in memory */
+ struct pb_target *, /* inode for block */
+ loff_t, /* starting offset of range */
+ size_t, /* length of range */
+ page_buf_flags_t); /* PBF_LOCK */
+
+extern page_buf_t *pagebuf_get( /* allocate a buffer */
+ struct pb_target *, /* inode for buffer */
+ loff_t, /* starting offset of range */
+ size_t, /* length of range */
+ page_buf_flags_t); /* PBF_LOCK, PBF_READ, */
+ /* PBF_ASYNC */
+
+extern page_buf_t *pagebuf_lookup(
+ struct pb_target *,
+ loff_t, /* starting offset of range */
+ size_t, /* length of range */
+ page_buf_flags_t); /* PBF_READ, PBF_WRITE, */
+ /* PBF_FORCEIO, _PBF_LOCKABLE */
+
+extern page_buf_t *pagebuf_get_empty( /* allocate pagebuf struct with */
+ /* no memory or disk address */
+ size_t len,
+ struct pb_target *); /* mount point "fake" inode */
+
+extern page_buf_t *pagebuf_get_no_daddr(/* allocate pagebuf struct */
+ /* without disk address */
+ size_t len,
+ struct pb_target *); /* mount point "fake" inode */
+
+extern int pagebuf_associate_memory(
+ page_buf_t *,
+ void *,
+ size_t);
+
+extern void pagebuf_hold( /* increment reference count */
+ page_buf_t *); /* buffer to hold */
+
+extern void pagebuf_readahead( /* read ahead into cache */
+ struct pb_target *, /* target for buffer (or NULL) */
+ loff_t, /* starting offset of range */
+ size_t, /* length of range */
+ page_buf_flags_t); /* additional read flags */
+
+/* Releasing Buffers */
+
+extern void pagebuf_free( /* deallocate a buffer */
+ page_buf_t *); /* buffer to deallocate */
+
+extern void pagebuf_rele( /* release hold on a buffer */
+ page_buf_t *); /* buffer to release */
+
+/* Locking and Unlocking Buffers */
+
+extern int pagebuf_cond_lock( /* lock buffer, if not locked */
+ /* (returns -EBUSY if locked) */
+ page_buf_t *); /* buffer to lock */
+
+extern int pagebuf_lock_value( /* return count on lock */
+ page_buf_t *); /* buffer to check */
+
+extern int pagebuf_lock( /* lock buffer */
+ page_buf_t *); /* buffer to lock */
+
+extern void pagebuf_unlock( /* unlock buffer */
+ page_buf_t *); /* buffer to unlock */
+
+/* Buffer Read and Write Routines */
+
+extern void pagebuf_iodone( /* mark buffer I/O complete */
+ page_buf_t *, /* buffer to mark */
+ int, /* use data/log helper thread. */
+ int); /* run completion locally, or in
+ * a helper thread. */
+
+extern void pagebuf_ioerror( /* mark buffer in error (or not) */
+ page_buf_t *, /* buffer to mark */
+ unsigned int); /* error to store (0 if none) */
+
+extern int pagebuf_iostart( /* start I/O on a buffer */
+ page_buf_t *, /* buffer to start */
+ page_buf_flags_t); /* PBF_LOCK, PBF_ASYNC, */
+ /* PBF_READ, PBF_WRITE, */
+ /* PBF_DELWRI, PBF_SYNC */
+
+extern int pagebuf_iorequest( /* start real I/O */
+ page_buf_t *); /* buffer to convey to device */
+
+extern int pagebuf_iowait( /* wait for buffer I/O done */
+ page_buf_t *); /* buffer to wait on */
+
+extern void pagebuf_iomove( /* move data in/out of pagebuf */
+ page_buf_t *, /* buffer to manipulate */
+ size_t, /* starting buffer offset */
+ size_t, /* length in buffer */
+ caddr_t, /* data pointer */
+ page_buf_rw_t); /* direction */
+
+static inline int pagebuf_iostrategy(page_buf_t *pb)
+{
+ return pb->pb_strat ? pb->pb_strat(pb) : pagebuf_iorequest(pb);
+}
+
+static inline int pagebuf_geterror(page_buf_t *pb)
+{
+ return pb ? pb->pb_error : ENOMEM;
+}
+
+/* Buffer Utility Routines */
+
+extern caddr_t pagebuf_offset( /* pointer at offset in buffer */
+ page_buf_t *, /* buffer to offset into */
+ size_t); /* offset */
+
+/* Pinning Buffer Storage in Memory */
+
+extern void pagebuf_pin( /* pin buffer in memory */
+ page_buf_t *); /* buffer to pin */
+
+extern void pagebuf_unpin( /* unpin buffered data */
+ page_buf_t *); /* buffer to unpin */
+
+extern int pagebuf_ispin( /* check if buffer is pinned */
+ page_buf_t *); /* buffer to check */
+
+/* Delayed Write Buffer Routines */
+
+#define PBDF_WAIT 0x01
+extern void pagebuf_delwri_flush(
+ pb_target_t *,
+ unsigned long,
+ int *);
+
+extern void pagebuf_delwri_dequeue(
+ page_buf_t *);
+
+/* Buffer Daemon Setup Routines */
+
+extern int pagebuf_init(void);
+extern void pagebuf_terminate(void);
+
+
+#ifdef PAGEBUF_TRACE
+extern ktrace_t *pagebuf_trace_buf;
+extern void pagebuf_trace(
+ page_buf_t *, /* buffer being traced */
+ char *, /* description of operation */
+ void *, /* arbitrary diagnostic value */
+ void *); /* return address */
+#else
+# define pagebuf_trace(pb, id, ptr, ra) do { } while (0)
+#endif
+
+#define pagebuf_target_name(target) \
+ ({ char __b[BDEVNAME_SIZE]; bdevname((target)->pbr_bdev, __b); __b; })
+
+
+/* These are just for xfs_syncsub... it sets an internal variable
+ * then passes it to VOP_FLUSH_PAGES or adds the flags to a newly gotten buf_t
+ */
+#define XFS_B_ASYNC PBF_ASYNC
+#define XFS_B_DELWRI PBF_DELWRI
+#define XFS_B_READ PBF_READ
+#define XFS_B_WRITE PBF_WRITE
+#define XFS_B_STALE PBF_STALE
+
+#define XFS_BUF_TRYLOCK PBF_TRYLOCK
+#define XFS_INCORE_TRYLOCK PBF_TRYLOCK
+#define XFS_BUF_LOCK PBF_LOCK
+#define XFS_BUF_MAPPED PBF_MAPPED
+
+#define BUF_BUSY PBF_DONT_BLOCK
+
+#define XFS_BUF_BFLAGS(x) ((x)->pb_flags)
+#define XFS_BUF_ZEROFLAGS(x) \
+ ((x)->pb_flags &= ~(PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_SYNC|PBF_DELWRI))
+
+#define XFS_BUF_STALE(x) ((x)->pb_flags |= XFS_B_STALE)
+#define XFS_BUF_UNSTALE(x) ((x)->pb_flags &= ~XFS_B_STALE)
+#define XFS_BUF_ISSTALE(x) ((x)->pb_flags & XFS_B_STALE)
+#define XFS_BUF_SUPER_STALE(x) do { \
+ XFS_BUF_STALE(x); \
+ xfs_buf_undelay(x); \
+ XFS_BUF_DONE(x); \
+ } while (0)
+
+#define XFS_BUF_MANAGE PBF_FS_MANAGED
+#define XFS_BUF_UNMANAGE(x) ((x)->pb_flags &= ~PBF_FS_MANAGED)
+
+static inline void xfs_buf_undelay(page_buf_t *pb)
+{
+ if (pb->pb_flags & PBF_DELWRI) {
+ if (pb->pb_list.next != &pb->pb_list) {
+ pagebuf_delwri_dequeue(pb);
+ pagebuf_rele(pb);
+ } else {
+ pb->pb_flags &= ~PBF_DELWRI;
+ }
+ }
+}
+
+#define XFS_BUF_DELAYWRITE(x) ((x)->pb_flags |= PBF_DELWRI)
+#define XFS_BUF_UNDELAYWRITE(x) xfs_buf_undelay(x)
+#define XFS_BUF_ISDELAYWRITE(x) ((x)->pb_flags & PBF_DELWRI)
+
+#define XFS_BUF_ERROR(x,no) pagebuf_ioerror(x,no)
+#define XFS_BUF_GETERROR(x) pagebuf_geterror(x)
+#define XFS_BUF_ISERROR(x) (pagebuf_geterror(x)?1:0)
+
+#define XFS_BUF_DONE(x) ((x)->pb_flags &= ~(PBF_PARTIAL|PBF_NONE))
+#define XFS_BUF_UNDONE(x) ((x)->pb_flags |= PBF_PARTIAL|PBF_NONE)
+#define XFS_BUF_ISDONE(x) (!(PBF_NOT_DONE(x)))
+
+#define XFS_BUF_BUSY(x) ((x)->pb_flags |= PBF_FORCEIO)
+#define XFS_BUF_UNBUSY(x) ((x)->pb_flags &= ~PBF_FORCEIO)
+#define XFS_BUF_ISBUSY(x) (1)
+
+#define XFS_BUF_ASYNC(x) ((x)->pb_flags |= PBF_ASYNC)
+#define XFS_BUF_UNASYNC(x) ((x)->pb_flags &= ~PBF_ASYNC)
+#define XFS_BUF_ISASYNC(x) ((x)->pb_flags & PBF_ASYNC)
+
+#define XFS_BUF_FLUSH(x) ((x)->pb_flags |= PBF_FLUSH)
+#define XFS_BUF_UNFLUSH(x) ((x)->pb_flags &= ~PBF_FLUSH)
+#define XFS_BUF_ISFLUSH(x) ((x)->pb_flags & PBF_FLUSH)
+
+#define XFS_BUF_SHUT(x) printk("XFS_BUF_SHUT not implemented yet\n")
+#define XFS_BUF_UNSHUT(x) printk("XFS_BUF_UNSHUT not implemented yet\n")
+#define XFS_BUF_ISSHUT(x) (0)
+
+#define XFS_BUF_HOLD(x) pagebuf_hold(x)
+#define XFS_BUF_READ(x) ((x)->pb_flags |= PBF_READ)
+#define XFS_BUF_UNREAD(x) ((x)->pb_flags &= ~PBF_READ)
+#define XFS_BUF_ISREAD(x) ((x)->pb_flags & PBF_READ)
+
+#define XFS_BUF_WRITE(x) ((x)->pb_flags |= PBF_WRITE)
+#define XFS_BUF_UNWRITE(x) ((x)->pb_flags &= ~PBF_WRITE)
+#define XFS_BUF_ISWRITE(x) ((x)->pb_flags & PBF_WRITE)
+
+#define XFS_BUF_ISUNINITIAL(x) (0)
+#define XFS_BUF_UNUNINITIAL(x) (0)
+
+#define XFS_BUF_BP_ISMAPPED(bp) 1
+
+typedef struct page_buf_s xfs_buf_t;
+#define xfs_buf page_buf_s
+
+typedef struct pb_target xfs_buftarg_t;
+#define xfs_buftarg pb_target
+
+#define XFS_BUF_DATAIO(x) ((x)->pb_flags |= PBF_FS_DATAIOD)
+#define XFS_BUF_UNDATAIO(x) ((x)->pb_flags &= ~PBF_FS_DATAIOD)
+
+#define XFS_BUF_IODONE_FUNC(buf) (buf)->pb_iodone
+#define XFS_BUF_SET_IODONE_FUNC(buf, func) \
+ (buf)->pb_iodone = (func)
+#define XFS_BUF_CLR_IODONE_FUNC(buf) \
+ (buf)->pb_iodone = NULL
+#define XFS_BUF_SET_BDSTRAT_FUNC(buf, func) \
+ (buf)->pb_strat = (func)
+#define XFS_BUF_CLR_BDSTRAT_FUNC(buf) \
+ (buf)->pb_strat = NULL
+
+#define XFS_BUF_FSPRIVATE(buf, type) \
+ ((type)(buf)->pb_fspriv)
+#define XFS_BUF_SET_FSPRIVATE(buf, value) \
+ (buf)->pb_fspriv = (void *)(value)
+#define XFS_BUF_FSPRIVATE2(buf, type) \
+ ((type)(buf)->pb_fspriv2)
+#define XFS_BUF_SET_FSPRIVATE2(buf, value) \
+ (buf)->pb_fspriv2 = (void *)(value)
+#define XFS_BUF_FSPRIVATE3(buf, type) \
+ ((type)(buf)->pb_fspriv3)
+#define XFS_BUF_SET_FSPRIVATE3(buf, value) \
+ (buf)->pb_fspriv3 = (void *)(value)
+#define XFS_BUF_SET_START(buf)
+
+#define XFS_BUF_SET_BRELSE_FUNC(buf, value) \
+ (buf)->pb_relse = (value)
+
+#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->pb_addr)
+
+static inline xfs_caddr_t xfs_buf_offset(page_buf_t *bp, size_t offset)
+{
+ if (bp->pb_flags & PBF_MAPPED)
+ return XFS_BUF_PTR(bp) + offset;
+ return (xfs_caddr_t) pagebuf_offset(bp, offset);
+}
+
+#define XFS_BUF_SET_PTR(bp, val, count) \
+ pagebuf_associate_memory(bp, val, count)
+#define XFS_BUF_ADDR(bp) ((bp)->pb_bn)
+#define XFS_BUF_SET_ADDR(bp, blk) \
+ ((bp)->pb_bn = (page_buf_daddr_t)(blk))
+#define XFS_BUF_OFFSET(bp) ((bp)->pb_file_offset)
+#define XFS_BUF_SET_OFFSET(bp, off) \
+ ((bp)->pb_file_offset = (off))
+#define XFS_BUF_COUNT(bp) ((bp)->pb_count_desired)
+#define XFS_BUF_SET_COUNT(bp, cnt) \
+ ((bp)->pb_count_desired = (cnt))
+#define XFS_BUF_SIZE(bp) ((bp)->pb_buffer_length)
+#define XFS_BUF_SET_SIZE(bp, cnt) \
+ ((bp)->pb_buffer_length = (cnt))
+#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)
+#define XFS_BUF_SET_VTYPE(bp, type)
+#define XFS_BUF_SET_REF(bp, ref)
+
+#define XFS_BUF_ISPINNED(bp) pagebuf_ispin(bp)
+
+#define XFS_BUF_VALUSEMA(bp) pagebuf_lock_value(bp)
+#define XFS_BUF_CPSEMA(bp) (pagebuf_cond_lock(bp) == 0)
+#define XFS_BUF_VSEMA(bp) pagebuf_unlock(bp)
+#define XFS_BUF_PSEMA(bp,x) pagebuf_lock(bp)
+#define XFS_BUF_V_IODONESEMA(bp)	up(&(bp)->pb_iodonesema)
+
+/* setup the buffer target from a buftarg structure */
+#define XFS_BUF_SET_TARGET(bp, target) \
+ (bp)->pb_target = (target)
+#define XFS_BUF_TARGET(bp) ((bp)->pb_target)
+#define XFS_BUFTARG_NAME(target) \
+ pagebuf_target_name(target)
+
+
+#define xfs_buf_read(target, blkno, len, flags) \
+ pagebuf_get((target), (blkno), (len), \
+ PBF_LOCK | PBF_READ | PBF_MAPPED | PBF_MAPPABLE)
+#define xfs_buf_get(target, blkno, len, flags) \
+ pagebuf_get((target), (blkno), (len), \
+ PBF_LOCK | PBF_MAPPED | PBF_MAPPABLE)
+
+#define xfs_buf_read_flags(target, blkno, len, flags) \
+ pagebuf_get((target), (blkno), (len), \
+ PBF_READ | PBF_MAPPABLE | flags)
+#define xfs_buf_get_flags(target, blkno, len, flags) \
+ pagebuf_get((target), (blkno), (len), \
+ PBF_MAPPABLE | flags)
+
+static inline int xfs_bawrite(void *mp, page_buf_t *bp)
+{
+ bp->pb_fspriv3 = mp;
+ bp->pb_strat = xfs_bdstrat_cb;
+ xfs_buf_undelay(bp);
+ return pagebuf_iostart(bp, PBF_WRITE | PBF_ASYNC | PBF_RUN_QUEUES);
+}
+
+static inline void xfs_buf_relse(page_buf_t *bp)
+{
+ if ((bp->pb_flags & _PBF_LOCKABLE) && !bp->pb_relse)
+ pagebuf_unlock(bp);
+ pagebuf_rele(bp);
+}
+
+#define xfs_bpin(bp) pagebuf_pin(bp)
+#define xfs_bunpin(bp) pagebuf_unpin(bp)
+
+#define xfs_buftrace(id, bp) \
+ pagebuf_trace(bp, id, NULL, (void *)__builtin_return_address(0))
+
+#define xfs_biodone(pb) \
+ pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), 0)
+
+#define xfs_incore(buftarg,blkno,len,lockit) \
+	pagebuf_find(buftarg, blkno, len, lockit)
+
+
+#define xfs_biomove(pb, off, len, data, rw) \
+ pagebuf_iomove((pb), (off), (len), (data), \
+ ((rw) == XFS_B_WRITE) ? PBRW_WRITE : PBRW_READ)
+
+#define xfs_biozero(pb, off, len) \
+ pagebuf_iomove((pb), (off), (len), NULL, PBRW_ZERO)
+
+
+static inline int XFS_bwrite(page_buf_t *pb)
+{
+ int iowait = (pb->pb_flags & PBF_ASYNC) == 0;
+ int error = 0;
+
+ pb->pb_flags |= PBF_SYNC;
+ if (!iowait)
+ pb->pb_flags |= PBF_RUN_QUEUES;
+
+ xfs_buf_undelay(pb);
+ pagebuf_iostrategy(pb);
+ if (iowait) {
+ error = pagebuf_iowait(pb);
+ xfs_buf_relse(pb);
+ }
+ return error;
+}
+
+#define XFS_bdwrite(pb) \
+ pagebuf_iostart(pb, PBF_DELWRI | PBF_ASYNC)
+
+static inline int xfs_bdwrite(void *mp, page_buf_t *bp)
+{
+ bp->pb_strat = xfs_bdstrat_cb;
+ bp->pb_fspriv3 = mp;
+
+ return pagebuf_iostart(bp, PBF_DELWRI | PBF_ASYNC);
+}
+
+#define XFS_bdstrat(bp) pagebuf_iorequest(bp)
+
+#define xfs_iowait(pb) pagebuf_iowait(pb)
+
+
+/*
+ * Go through all incore buffers, and release buffers
+ * if they belong to the given device. This is used in
+ * filesystem error handling to preserve the consistency
+ * of its metadata.
+ */
+
+#define xfs_binval(buftarg) xfs_flush_buftarg(buftarg)
+
+#define XFS_bflush(buftarg) xfs_flush_buftarg(buftarg)
+
+#define xfs_incore_relse(buftarg,delwri_only,wait) \
+ xfs_relse_buftarg(buftarg)
+
+#define xfs_baread(target, rablkno, ralen) \
+ pagebuf_readahead((target), (rablkno), (ralen), PBF_DONT_BLOCK)
+
+#define xfs_buf_get_empty(len, target) pagebuf_get_empty((len), (target))
+#define xfs_buf_get_noaddr(len, target) pagebuf_get_no_daddr((len), (target))
+#define xfs_buf_free(bp) pagebuf_free(bp)
+
+#endif /* __XFS_BUF_H__ */
+
+++ /dev/null
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like. Any license provided herein, whether implied or
- * otherwise, applies only to this software file. Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA 94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-
-#include "xfs.h"
-
-#include "xfs_fs.h"
-#include "xfs_inum.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_bit.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_rw.h"
-#include "xfs_acl.h"
-#include "xfs_cap.h"
-#include "xfs_mac.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_trans_space.h"
-#include "xfs_utils.h"
-#include "xfs_iomap.h"
-
-#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
- << mp->m_writeio_log)
-#define XFS_STRAT_WRITE_IMAPS 2
-#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
-
-STATIC int
-xfs_imap_to_bmap(
- xfs_iocore_t *io,
- xfs_off_t offset,
- xfs_bmbt_irec_t *imap,
- xfs_iomap_t *iomapp,
- int imaps, /* Number of imap entries */
- int iomaps, /* Number of iomap entries */
- int flags)
-{
- xfs_mount_t *mp;
- xfs_fsize_t nisize;
- int pbm;
- xfs_fsblock_t start_block;
-
- mp = io->io_mount;
- nisize = XFS_SIZE(mp, io);
- if (io->io_new_size > nisize)
- nisize = io->io_new_size;
-
- for (pbm = 0; imaps && pbm < iomaps; imaps--, iomapp++, imap++, pbm++) {
- iomapp->iomap_target = io->io_flags & XFS_IOCORE_RT ?
- mp->m_rtdev_targp : mp->m_ddev_targp;
- iomapp->iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff);
- iomapp->iomap_delta = offset - iomapp->iomap_offset;
- iomapp->iomap_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount);
- iomapp->iomap_flags = flags;
-
- start_block = imap->br_startblock;
- if (start_block == HOLESTARTBLOCK) {
- iomapp->iomap_bn = IOMAP_DADDR_NULL;
- iomapp->iomap_flags = IOMAP_HOLE;
- } else if (start_block == DELAYSTARTBLOCK) {
- iomapp->iomap_bn = IOMAP_DADDR_NULL;
- iomapp->iomap_flags = IOMAP_DELAY;
- } else {
- iomapp->iomap_bn = XFS_FSB_TO_DB_IO(io, start_block);
- if (ISUNWRITTEN(imap))
- iomapp->iomap_flags |= IOMAP_UNWRITTEN;
- }
-
- if ((iomapp->iomap_offset + iomapp->iomap_bsize) >= nisize) {
- iomapp->iomap_flags |= IOMAP_EOF;
- }
-
- offset += iomapp->iomap_bsize - iomapp->iomap_delta;
- }
- return pbm; /* Return the number filled */
-}
-
-int
-xfs_iomap(
- xfs_iocore_t *io,
- xfs_off_t offset,
- ssize_t count,
- int flags,
- xfs_iomap_t *iomapp,
- int *niomaps)
-{
- xfs_mount_t *mp = io->io_mount;
- xfs_fileoff_t offset_fsb, end_fsb;
- int error = 0;
- int lockmode = 0;
- xfs_bmbt_irec_t imap;
- int nimaps = 1;
- int bmapi_flags = 0;
- int iomap_flags = 0;
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
-
- switch (flags &
- (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE |
- BMAPI_UNWRITTEN | BMAPI_DEVICE)) {
- case BMAPI_READ:
- lockmode = XFS_LCK_MAP_SHARED(mp, io);
- bmapi_flags = XFS_BMAPI_ENTIRE;
- if (flags & BMAPI_IGNSTATE)
- bmapi_flags |= XFS_BMAPI_IGSTATE;
- break;
- case BMAPI_WRITE:
- lockmode = XFS_ILOCK_EXCL|XFS_EXTSIZE_WR;
- bmapi_flags = 0;
- XFS_ILOCK(mp, io, lockmode);
- break;
- case BMAPI_ALLOCATE:
- lockmode = XFS_ILOCK_SHARED|XFS_EXTSIZE_RD;
- bmapi_flags = XFS_BMAPI_ENTIRE;
- /* Attempt non-blocking lock */
- if (flags & BMAPI_TRYLOCK) {
- if (!XFS_ILOCK_NOWAIT(mp, io, lockmode))
- return XFS_ERROR(EAGAIN);
- } else {
- XFS_ILOCK(mp, io, lockmode);
- }
- break;
- case BMAPI_UNWRITTEN:
- goto phase2;
- case BMAPI_DEVICE:
- lockmode = XFS_LCK_MAP_SHARED(mp, io);
- iomapp->iomap_target = io->io_flags & XFS_IOCORE_RT ?
- mp->m_rtdev_targp : mp->m_ddev_targp;
- error = 0;
- *niomaps = 1;
- goto out;
- default:
- BUG();
- }
-
- ASSERT(offset <= mp->m_maxioffset);
- if ((xfs_fsize_t)offset + count > mp->m_maxioffset)
- count = mp->m_maxioffset - offset;
- end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
- offset_fsb = XFS_B_TO_FSBT(mp, offset);
-
- error = XFS_BMAPI(mp, NULL, io, offset_fsb,
- (xfs_filblks_t)(end_fsb - offset_fsb),
- bmapi_flags, NULL, 0, &imap,
- &nimaps, NULL);
-
- if (error)
- goto out;
-
-phase2:
- switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE|BMAPI_UNWRITTEN)) {
- case BMAPI_WRITE:
- /* If we found an extent, return it */
- if (nimaps && (imap.br_startblock != HOLESTARTBLOCK))
- break;
-
- if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) {
- error = XFS_IOMAP_WRITE_DIRECT(mp, io, offset,
- count, flags, &imap, &nimaps, nimaps);
- } else {
- error = XFS_IOMAP_WRITE_DELAY(mp, io, offset, count,
- flags, &imap, &nimaps);
- }
- iomap_flags = IOMAP_NEW;
- break;
- case BMAPI_ALLOCATE:
- /* If we found an extent, return it */
- XFS_IUNLOCK(mp, io, lockmode);
- lockmode = 0;
-
- if (nimaps && !ISNULLSTARTBLOCK(imap.br_startblock))
- break;
-
- error = XFS_IOMAP_WRITE_ALLOCATE(mp, io, &imap, &nimaps);
- break;
- case BMAPI_UNWRITTEN:
- lockmode = 0;
- error = XFS_IOMAP_WRITE_UNWRITTEN(mp, io, offset, count);
- nimaps = 0;
- break;
- }
-
- if (nimaps) {
- *niomaps = xfs_imap_to_bmap(io, offset, &imap,
- iomapp, nimaps, *niomaps, iomap_flags);
- } else if (niomaps) {
- *niomaps = 0;
- }
-
-out:
- if (lockmode)
- XFS_IUNLOCK(mp, io, lockmode);
- return XFS_ERROR(error);
-}
-
-STATIC int
-xfs_flush_space(
- xfs_inode_t *ip,
- int *fsynced,
- int *ioflags)
-{
- switch (*fsynced) {
- case 0:
- if (ip->i_delayed_blks) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_flush_inode(ip);
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- *fsynced = 1;
- } else {
- *ioflags |= BMAPI_SYNC;
- *fsynced = 2;
- }
- return 0;
- case 1:
- *fsynced = 2;
- *ioflags |= BMAPI_SYNC;
- return 0;
- case 2:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_flush_device(ip);
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- *fsynced = 3;
- return 0;
- }
- return 1;
-}
-
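xfs_flush_space escalates through progressively more expensive ways of finding free space, and the caller retries the allocation after each stage. A compilable userspace sketch of that ladder; flush_inode() and flush_device() are hypothetical stand-ins for the real flushing primitives:

#include <stdio.h>

static void flush_inode(void)  { puts("flush this inode's delalloc data"); }
static void flush_device(void) { puts("flush the whole device"); }

/* Returns 0 to ask the caller to retry the allocation, 1 to give up. */
static int flush_space(int *fsynced, int *sync_writes, int have_delalloc)
{
	switch (*fsynced) {
	case 0:			/* cheapest: push this inode's dirty data */
		if (have_delalloc) {
			flush_inode();
			*fsynced = 1;
		} else {
			*sync_writes = 1;	/* BMAPI_SYNC analogue */
			*fsynced = 2;
		}
		return 0;
	case 1:			/* next: stop over-allocating for this write */
		*sync_writes = 1;
		*fsynced = 2;
		return 0;
	case 2:			/* last resort: flush everything on the device */
		flush_device();
		*fsynced = 3;
		return 0;
	}
	return 1;		/* every stage tried; report ENOSPC */
}

int main(void)
{
	int fsynced = 0, sync_writes = 0;

	while (flush_space(&fsynced, &sync_writes, 1) == 0)
		puts("retry the allocation");
	puts("still no space: ENOSPC");
	return 0;
}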
-int
-xfs_iomap_write_direct(
- xfs_inode_t *ip,
- loff_t offset,
- size_t count,
- int flags,
- xfs_bmbt_irec_t *ret_imap,
- int *nmaps,
- int found)
-{
- xfs_mount_t *mp = ip->i_mount;
- xfs_iocore_t *io = &ip->i_iocore;
- xfs_fileoff_t offset_fsb;
- xfs_fileoff_t last_fsb;
- xfs_filblks_t count_fsb;
- xfs_fsize_t isize;
- xfs_fsblock_t firstfsb;
- int nimaps, maps;
- int error;
- int bmapi_flag;
- int rt;
- xfs_trans_t *tp;
- xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS], *imapp;
- xfs_bmap_free_t free_list;
- int aeof;
- xfs_filblks_t datablocks;
- int committed;
- int numrtextents;
- uint resblks;
-
- /*
- * Make sure that the dquots are there. This doesn't hold
- * the ilock across a disk read.
- */
-
- error = XFS_QM_DQATTACH(ip->i_mount, ip, XFS_QMOPT_ILOCKED);
- if (error)
- return XFS_ERROR(error);
-
- maps = min(XFS_WRITE_IMAPS, *nmaps);
- nimaps = maps;
-
- isize = ip->i_d.di_size;
- aeof = (offset + count) > isize;
-
- if (io->io_new_size > isize)
- isize = io->io_new_size;
-
- offset_fsb = XFS_B_TO_FSBT(mp, offset);
- last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
- count_fsb = last_fsb - offset_fsb;
- if (found && (ret_imap->br_startblock == HOLESTARTBLOCK)) {
- xfs_fileoff_t map_last_fsb;
-
- map_last_fsb = ret_imap->br_blockcount + ret_imap->br_startoff;
-
- if (map_last_fsb < last_fsb) {
- last_fsb = map_last_fsb;
- count_fsb = last_fsb - offset_fsb;
- }
- ASSERT(count_fsb > 0);
- }
-
- /*
-	 * Determine whether we are reserving space on
-	 * the data or the realtime partition.
- */
- if ((rt = XFS_IS_REALTIME_INODE(ip))) {
- int sbrtextsize, iprtextsize;
-
- sbrtextsize = mp->m_sb.sb_rextsize;
- iprtextsize =
- ip->i_d.di_extsize ? ip->i_d.di_extsize : sbrtextsize;
- numrtextents = (count_fsb + iprtextsize - 1);
- do_div(numrtextents, sbrtextsize);
- datablocks = 0;
- } else {
- datablocks = count_fsb;
- numrtextents = 0;
- }
-
- /*
- * allocate and setup the transaction
- */
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-
- resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks);
-
- error = xfs_trans_reserve(tp, resblks,
- XFS_WRITE_LOG_RES(mp), numrtextents,
- XFS_TRANS_PERM_LOG_RES,
- XFS_WRITE_LOG_COUNT);
-
- /*
- * check for running out of space
- */
- if (error)
- /*
- * Free the transaction structure.
- */
- xfs_trans_cancel(tp, 0);
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
-
- if (error)
-		goto error_out; /* Don't return in the if (error) branch
-				   above; the ilock must be retaken first */
-
- if (XFS_TRANS_RESERVE_BLKQUOTA(mp, tp, ip, resblks)) {
- error = (EDQUOT);
- goto error1;
- }
- nimaps = 1;
-
- bmapi_flag = XFS_BMAPI_WRITE;
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, ip);
-
- if (!(flags & BMAPI_MMAP) && (offset < ip->i_d.di_size || rt))
- bmapi_flag |= XFS_BMAPI_PREALLOC;
-
- /*
- * issue the bmapi() call to allocate the blocks
- */
- XFS_BMAP_INIT(&free_list, &firstfsb);
- imapp = &imap[0];
- error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
- bmapi_flag, &firstfsb, 0, imapp, &nimaps, &free_list);
- if (error) {
- goto error0;
- }
-
- /*
- * complete the transaction
- */
-
- error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
- if (error) {
- goto error0;
- }
-
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
- if (error) {
- goto error_out;
- }
-
- /* copy any maps to caller's array and return any error. */
- if (nimaps == 0) {
- error = (ENOSPC);
- goto error_out;
- }
-
- *ret_imap = imap[0];
- *nmaps = 1;
- return 0;
-
- error0: /* Cancel bmap, unlock inode, and cancel trans */
- xfs_bmap_cancel(&free_list);
-
- error1: /* Just cancel transaction */
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
- *nmaps = 0; /* nothing set-up here */
-
-error_out:
- return XFS_ERROR(error);
-}
-
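For realtime inodes the reservation above rounds the block count up to whole realtime extents (the kernel version additionally distinguishes the inode's extent size hint from the superblock's rextsize). The ceiling-division idiom, standalone and with illustrative values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t count_fsb = 10;	/* blocks the write needs */
	uint64_t extsize = 4;		/* blocks per realtime extent */
	uint64_t rtextents = (count_fsb + extsize - 1) / extsize;

	/* 10 blocks need 3 whole 4-block extents */
	printf("%llu blocks -> %llu rt extents\n",
	       (unsigned long long)count_fsb, (unsigned long long)rtextents);
	return 0;
}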
-int
-xfs_iomap_write_delay(
- xfs_inode_t *ip,
- loff_t offset,
- size_t count,
- int ioflag,
- xfs_bmbt_irec_t *ret_imap,
- int *nmaps)
-{
- xfs_mount_t *mp = ip->i_mount;
- xfs_iocore_t *io = &ip->i_iocore;
- xfs_fileoff_t offset_fsb;
- xfs_fileoff_t last_fsb;
- xfs_fsize_t isize;
- xfs_fsblock_t firstblock;
- int nimaps;
- int error;
- xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
- int aeof;
- int fsynced = 0;
-
- ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0);
-
- /*
- * Make sure that the dquots are there. This doesn't hold
- * the ilock across a disk read.
- */
-
- error = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED);
- if (error)
- return XFS_ERROR(error);
-
-retry:
- isize = ip->i_d.di_size;
- if (io->io_new_size > isize) {
- isize = io->io_new_size;
- }
-
- aeof = 0;
- offset_fsb = XFS_B_TO_FSBT(mp, offset);
- last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
- /*
- * If the caller is doing a write at the end of the file,
- * then extend the allocation (and the buffer used for the write)
- * out to the file system's write iosize. We clean up any extra
- * space left over when the file is closed in xfs_inactive().
- *
- * We don't bother with this for sync writes, because we need
- * to minimize the amount we write for good performance.
- */
- if (!(ioflag & BMAPI_SYNC) && ((offset + count) > ip->i_d.di_size)) {
- xfs_off_t aligned_offset;
- unsigned int iosize;
- xfs_fileoff_t ioalign;
-
- iosize = mp->m_writeio_blocks;
- aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
- ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
- last_fsb = ioalign + iosize;
- aeof = 1;
- }
-
- nimaps = XFS_WRITE_IMAPS;
- firstblock = NULLFSBLOCK;
-
- /*
-	 * Round up the allocation request to an m_dalign boundary if the
-	 * file size is greater than 512K and we are allocating past the
-	 * allocation eof.
- */
- if (mp->m_dalign && (isize >= mp->m_dalign) && aeof) {
- int eof;
- xfs_fileoff_t new_last_fsb;
- new_last_fsb = roundup_64(last_fsb, mp->m_dalign);
- error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
- if (error) {
- return error;
- }
- if (eof) {
- last_fsb = new_last_fsb;
- }
- }
-
- error = xfs_bmapi(NULL, ip, offset_fsb,
- (xfs_filblks_t)(last_fsb - offset_fsb),
- XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
- XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
- &nimaps, NULL);
- /*
- * This can be EDQUOT, if nimaps == 0
- */
- if (error && (error != ENOSPC)) {
- return XFS_ERROR(error);
- }
- /*
- * If bmapi returned us nothing, and if we didn't get back EDQUOT,
- * then we must have run out of space.
- */
-
- if (nimaps == 0) {
- if (xfs_flush_space(ip, &fsynced, &ioflag))
- return XFS_ERROR(ENOSPC);
-
- error = 0;
- goto retry;
- }
-
- *ret_imap = imap[0];
- *nmaps = 1;
- return 0;
-}
-
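When a write extends EOF, the function rounds the delalloc reservation out to the filesystem's write-I/O size using the XFS_WRITEIO_ALIGN round-down plus one full chunk. A byte-granular sketch with an assumed 64k write iosize (the kernel does the same arithmetic in file-system blocks):

#include <stdio.h>
#include <stdint.h>

#define WRITEIO_LOG	16	/* assumed 64k write iosize */

static uint64_t writeio_align(uint64_t off)
{
	return (off >> WRITEIO_LOG) << WRITEIO_LOG;	/* round down */
}

int main(void)
{
	uint64_t offset = 200000, count = 3000;
	/* align the last byte written, then reserve one full iosize chunk */
	uint64_t aligned = writeio_align(offset + count - 1);
	uint64_t reserve_end = aligned + (1ULL << WRITEIO_LOG);

	printf("write ends at %llu, reservation extends to %llu\n",
	       (unsigned long long)(offset + count),
	       (unsigned long long)reserve_end);
	return 0;
}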
-/*
- * Pass in a delayed allocate extent, convert it to real extents;
- * return to the caller the extent we create which maps on top of
- * the originating caller's request.
- *
- * Called without a lock on the inode.
- */
-int
-xfs_iomap_write_allocate(
- xfs_inode_t *ip,
- xfs_bmbt_irec_t *map,
- int *retmap)
-{
- xfs_mount_t *mp = ip->i_mount;
- xfs_fileoff_t offset_fsb, last_block;
- xfs_fileoff_t end_fsb, map_start_fsb;
- xfs_fsblock_t first_block;
- xfs_bmap_free_t free_list;
- xfs_filblks_t count_fsb;
- xfs_bmbt_irec_t imap[XFS_STRAT_WRITE_IMAPS];
- xfs_trans_t *tp;
- int i, nimaps, committed;
- int error = 0;
- int nres;
-
- *retmap = 0;
-
- /*
- * Make sure that the dquots are there.
- */
-
- if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
- return XFS_ERROR(error);
-
- offset_fsb = map->br_startoff;
- count_fsb = map->br_blockcount;
- map_start_fsb = offset_fsb;
-
- XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
-
- while (count_fsb != 0) {
- /*
- * Set up a transaction with which to allocate the
- * backing store for the file. Do allocations in a
- * loop until we get some space in the range we are
- * interested in. The other space that might be allocated
- * is in the delayed allocation extent on which we sit
- * but before our buffer starts.
- */
-
- nimaps = 0;
- while (nimaps == 0) {
- tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
- nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
- error = xfs_trans_reserve(tp, nres,
- XFS_WRITE_LOG_RES(mp),
- 0, XFS_TRANS_PERM_LOG_RES,
- XFS_WRITE_LOG_COUNT);
-
- if (error == ENOSPC) {
- error = xfs_trans_reserve(tp, 0,
- XFS_WRITE_LOG_RES(mp),
- 0,
- XFS_TRANS_PERM_LOG_RES,
- XFS_WRITE_LOG_COUNT);
- }
- if (error) {
- xfs_trans_cancel(tp, 0);
- return XFS_ERROR(error);
- }
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, ip);
-
- XFS_BMAP_INIT(&free_list, &first_block);
-
- nimaps = XFS_STRAT_WRITE_IMAPS;
- /*
- * Ensure we don't go beyond eof - it is possible
- * the extents changed since we did the read call,
-			 * as we dropped the ilock in the interim.
- */
-
- end_fsb = XFS_B_TO_FSB(mp, ip->i_d.di_size);
- xfs_bmap_last_offset(NULL, ip, &last_block,
- XFS_DATA_FORK);
- last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
- if ((map_start_fsb + count_fsb) > last_block) {
- count_fsb = last_block - map_start_fsb;
- if (count_fsb == 0) {
- error = EAGAIN;
- goto trans_cancel;
- }
- }
-
- /* Go get the actual blocks */
- error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb,
- XFS_BMAPI_WRITE, &first_block, 1,
- imap, &nimaps, &free_list);
-
- if (error)
- goto trans_cancel;
-
- error = xfs_bmap_finish(&tp, &free_list,
- first_block, &committed);
-
- if (error)
- goto trans_cancel;
-
- error = xfs_trans_commit(tp,
- XFS_TRANS_RELEASE_LOG_RES, NULL);
-
- if (error)
- goto error0;
-
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
-
- /*
- * See if we were able to allocate an extent that
-		 * covers at least part of the caller's request.
- */
-
- for (i = 0; i < nimaps; i++) {
- if ((map->br_startoff >= imap[i].br_startoff) &&
- (map->br_startoff < (imap[i].br_startoff +
- imap[i].br_blockcount))) {
- *map = imap[i];
- *retmap = 1;
- XFS_STATS_INC(xs_xstrat_quick);
- return 0;
- }
- count_fsb -= imap[i].br_blockcount;
- }
-
- /* So far we have not mapped the requested part of the
-		 * file, just surrounding data; try again.
- */
- nimaps--;
- offset_fsb = imap[nimaps].br_startoff +
- imap[nimaps].br_blockcount;
- map_start_fsb = offset_fsb;
- }
-
-trans_cancel:
- xfs_bmap_cancel(&free_list);
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
-error0:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return XFS_ERROR(error);
-}
-
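The success test in the allocation loop is simple interval containment: did the extent we got back cover the offset the caller asked about? Standalone version:

#include <stdio.h>
#include <stdint.h>

struct extent { uint64_t startoff, blockcount; };

static int covers(const struct extent *e, uint64_t off)
{
	return off >= e->startoff && off < e->startoff + e->blockcount;
}

int main(void)
{
	struct extent got = { .startoff = 8, .blockcount = 4 };	/* [8,12) */

	printf("%d %d\n", covers(&got, 10), covers(&got, 12));	/* 1 0 */
	return 0;
}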
-int
-xfs_iomap_write_unwritten(
- xfs_inode_t *ip,
- loff_t offset,
- size_t count)
-{
- xfs_mount_t *mp = ip->i_mount;
- xfs_trans_t *tp;
- xfs_fileoff_t offset_fsb;
- xfs_filblks_t count_fsb;
- xfs_filblks_t numblks_fsb;
- xfs_bmbt_irec_t imap;
- int committed;
- int error;
- int nres;
- int nimaps;
- xfs_fsblock_t firstfsb;
- xfs_bmap_free_t free_list;
-
- offset_fsb = XFS_B_TO_FSBT(mp, offset);
- count_fsb = XFS_B_TO_FSB(mp, count);
-
- do {
- nres = XFS_DIOSTRAT_SPACE_RES(mp, 0);
-
- /*
- * set up a transaction to convert the range of extents
- * from unwritten to real. Do allocations in a loop until
- * we have covered the range passed in.
- */
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
- error = xfs_trans_reserve(tp, nres,
- XFS_WRITE_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES,
- XFS_WRITE_LOG_COUNT);
- if (error) {
- xfs_trans_cancel(tp, 0);
- goto error0;
- }
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, ip);
-
- /*
- * Modify the unwritten extent state of the buffer.
- */
- XFS_BMAP_INIT(&free_list, &firstfsb);
- nimaps = 1;
- error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
- XFS_BMAPI_WRITE, &firstfsb,
- 1, &imap, &nimaps, &free_list);
- if (error)
- goto error_on_bmapi_transaction;
-
- error = xfs_bmap_finish(&(tp), &(free_list),
- firstfsb, &committed);
- if (error)
- goto error_on_bmapi_transaction;
-
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- goto error0;
-
- if ((numblks_fsb = imap.br_blockcount) == 0) {
- /*
- * The numblks_fsb value should always get
- * smaller, otherwise the loop is stuck.
- */
- ASSERT(imap.br_blockcount);
- break;
- }
- offset_fsb += numblks_fsb;
- count_fsb -= numblks_fsb;
- } while (count_fsb > 0);
-
- return 0;
-
-error_on_bmapi_transaction:
- xfs_bmap_cancel(&free_list);
- xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-error0:
- return XFS_ERROR(error);
-}
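The conversion loop above repeatedly converts a prefix of the remaining range and insists on forward progress each pass. The same shape in a userspace sketch, where convert_prefix() is a hypothetical stand-in limited to a few blocks per transaction:

#include <stdio.h>
#include <stdint.h>

#define MAXCHUNK 3	/* blocks one hypothetical transaction can convert */

static uint64_t convert_prefix(uint64_t off, uint64_t len)
{
	uint64_t done = len < MAXCHUNK ? len : MAXCHUNK;

	printf("convert [%llu, %llu)\n", (unsigned long long)off,
	       (unsigned long long)(off + done));
	return done;		/* blocks actually converted this pass */
}

int main(void)
{
	uint64_t off = 0, len = 8;

	while (len > 0) {
		uint64_t done = convert_prefix(off, len);

		if (done == 0)	/* no forward progress: would spin forever */
			break;
		off += done;
		len -= done;
	}
	return 0;
}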
#include <asm/byteorder.h>
#include <asm/unaligned.h>
-#include <linux/xfs_behavior.h>
-#include <linux/xfs_vfs.h>
-#include <linux/xfs_cred.h>
-#include <linux/xfs_vnode.h>
-#include <linux/xfs_stats.h>
-#include <linux/xfs_sysctl.h>
-#include <linux/xfs_iops.h>
-#include <linux/xfs_super.h>
-#include <linux/xfs_globals.h>
-#include <linux/xfs_fs_subr.h>
-#include <linux/xfs_lrw.h>
-
-#include <pagebuf/page_buf.h>
+#include <mrlock.h>
+#include <spin.h>
+#include <sv.h>
+#include <mutex.h>
+#include <sema.h>
+#include <time.h>
+#include <kmem.h>
+
+#include <xfs_behavior.h>
+#include <xfs_vfs.h>
+#include <xfs_cred.h>
+#include <xfs_vnode.h>
+#include <xfs_stats.h>
+#include <xfs_sysctl.h>
+#include <xfs_iops.h>
+#include <xfs_super.h>
+#include <xfs_globals.h>
+#include <xfs_fs_subr.h>
+#include <xfs_lrw.h>
+#include <xfs_buf.h>
/*
* Feature macros (disable/enable)
+++ /dev/null
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like. Any license provided herein, whether implied or
- * otherwise, applies only to this software file. Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA 94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-
-/*
- * page_buf.c
- *
- * The page_buf module provides an abstract buffer cache model on top of
- * the Linux page cache. Cached metadata blocks for a file system are
- * hashed to the inode for the block device. The page_buf module
- * assembles buffer (page_buf_t) objects on demand to aggregate such
- * cached pages for I/O.
- *
- *
- * Written by Steve Lord, Jim Mostek, Russell Cattelan
- * and Rajagopal Ananthanarayanan ("ananth") at SGI.
- *
- */
-
-#include <linux/module.h>
-#include <linux/stddef.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/pagemap.h>
-#include <linux/init.h>
-#include <linux/vmalloc.h>
-#include <linux/blkdev.h>
-#include <linux/bio.h>
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
-#include <linux/workqueue.h>
-#include <linux/suspend.h>
-#include <linux/percpu.h>
-
-#include <support/ktrace.h>
-#include <support/debug.h>
-#include <support/kmem.h>
-
-#include "page_buf.h"
-
-#define BBSHIFT 9
-#define BN_ALIGN_MASK ((1 << (PAGE_CACHE_SHIFT - BBSHIFT)) - 1)
-
-#ifndef GFP_READAHEAD
-#define GFP_READAHEAD (__GFP_NOWARN|__GFP_NORETRY)
-#endif
-
-/*
- * File wide globals
- */
-
-STATIC kmem_cache_t *pagebuf_cache;
-STATIC void pagebuf_daemon_wakeup(int);
-STATIC void pagebuf_delwri_queue(page_buf_t *, int);
-STATIC struct workqueue_struct *pagebuf_logio_workqueue;
-STATIC struct workqueue_struct *pagebuf_dataio_workqueue;
-
-/*
- * Pagebuf module configuration parameters, exported via
- * /proc/sys/vm/pagebuf
- */
-
-typedef struct pb_sysctl_val {
- int min;
- int val;
- int max;
-} pb_sysctl_val_t;
-
-struct {
- pb_sysctl_val_t flush_interval; /* interval between runs of the
- * delwri flush daemon. */
- pb_sysctl_val_t age_buffer; /* time for buffer to age before
- * we flush it. */
- pb_sysctl_val_t stats_clear; /* clear the pagebuf stats */
- pb_sysctl_val_t debug; /* debug tracing on or off */
-} pb_params = {
- /* MIN DFLT MAX */
- .flush_interval = { HZ/2, HZ, 30*HZ },
- .age_buffer = { 1*HZ, 15*HZ, 300*HZ },
- .stats_clear = { 0, 0, 1 },
- .debug = { 0, 0, 1 },
-};
-
-enum {
- PB_FLUSH_INT = 1,
- PB_FLUSH_AGE = 2,
- PB_STATS_CLEAR = 3,
- PB_DEBUG = 4,
-};
-
-/*
- * Pagebuf statistics variables
- */
-
-struct pbstats {
- u_int32_t pb_get;
- u_int32_t pb_create;
- u_int32_t pb_get_locked;
- u_int32_t pb_get_locked_waited;
- u_int32_t pb_busy_locked;
- u_int32_t pb_miss_locked;
- u_int32_t pb_page_retries;
- u_int32_t pb_page_found;
- u_int32_t pb_get_read;
-} pbstats;
-DEFINE_PER_CPU(struct pbstats, pbstats);
-
-/* We don't disable preemption; we're not too worried about poking
- * the wrong cpu's stat for now. */
-#define PB_STATS_INC(count) (__get_cpu_var(pbstats).count++)
-
-/*
- * Pagebuf debugging
- */
-
-#ifdef PAGEBUF_TRACE
-void
-pagebuf_trace(
- page_buf_t *pb,
- char *id,
- void *data,
- void *ra)
-{
- if (!pb_params.debug.val)
- return;
- ktrace_enter(pagebuf_trace_buf,
- pb, id,
- (void *)(unsigned long)pb->pb_flags,
- (void *)(unsigned long)pb->pb_hold.counter,
- (void *)(unsigned long)pb->pb_sema.count.counter,
- (void *)current,
- data, ra,
- (void *)(unsigned long)((pb->pb_file_offset>>32) & 0xffffffff),
- (void *)(unsigned long)(pb->pb_file_offset & 0xffffffff),
- (void *)(unsigned long)pb->pb_buffer_length,
- NULL, NULL, NULL, NULL, NULL);
-}
-ktrace_t *pagebuf_trace_buf;
-EXPORT_SYMBOL(pagebuf_trace_buf);
-#define PAGEBUF_TRACE_SIZE 4096
-#define PB_TRACE(pb, id, data) \
- pagebuf_trace(pb, id, (void *)data, (void *)__builtin_return_address(0))
-#else
-#define PB_TRACE(pb, id, data) do { } while (0)
-#endif
-
-#ifdef PAGEBUF_LOCK_TRACKING
-# define PB_SET_OWNER(pb) ((pb)->pb_last_holder = current->pid)
-# define PB_CLEAR_OWNER(pb) ((pb)->pb_last_holder = -1)
-# define PB_GET_OWNER(pb) ((pb)->pb_last_holder)
-#else
-# define PB_SET_OWNER(pb) do { } while (0)
-# define PB_CLEAR_OWNER(pb) do { } while (0)
-# define PB_GET_OWNER(pb) do { } while (0)
-#endif
-
-/*
- * Pagebuf allocation / freeing.
- */
-
-#define pb_to_gfp(flags) \
- (((flags) & PBF_READ_AHEAD) ? GFP_READAHEAD : \
- ((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL)
-
-#define pagebuf_allocate(flags) \
- kmem_cache_alloc(pagebuf_cache, pb_to_gfp(flags))
-#define pagebuf_deallocate(pb) \
- kmem_cache_free(pagebuf_cache, (pb));
-
-/*
- * Pagebuf hashing
- */
-
-#define NBITS 8
-#define NHASH (1<<NBITS)
-
-typedef struct {
- struct list_head pb_hash;
- int pb_count;
- spinlock_t pb_hash_lock;
-} pb_hash_t;
-
-STATIC pb_hash_t pbhash[NHASH];
-#define pb_hash(pb) &pbhash[pb->pb_hash_index]
-
-STATIC int
-_bhash(
- struct block_device *bdev,
- loff_t base)
-{
- int bit, hval;
-
- base >>= 9;
- base ^= (unsigned long)bdev / L1_CACHE_BYTES;
- for (bit = hval = 0; base && bit < sizeof(base) * 8; bit += NBITS) {
- hval ^= (int)base & (NHASH-1);
- base >>= NBITS;
- }
- return hval;
-}
-
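_bhash folds the (device, block) key into a small bucket index by xor-ing successive NBITS-wide slices of the shifted key. A standalone rendering, with a plain integer standing in for the scaled block_device pointer:

#include <stdio.h>
#include <stdint.h>

#define NBITS	8
#define NHASH	(1 << NBITS)

static int bhash(uint64_t dev, uint64_t base)
{
	int bit, hval;

	base >>= 9;			/* drop the sector offset */
	base ^= dev;			/* mix in the device identity */
	for (bit = hval = 0; base && bit < 64; bit += NBITS) {
		hval ^= (int)(base & (NHASH - 1));
		base >>= NBITS;
	}
	return hval;			/* bucket in 0 .. NHASH-1 */
}

int main(void)
{
	printf("%d %d\n", bhash(1, 4096), bhash(1, 1 << 20));
	return 0;
}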
-/*
- * Mapping of multi-page buffers into contiguous virtual space
- */
-
-STATIC void *pagebuf_mapout_locked(page_buf_t *);
-
-typedef struct a_list {
- void *vm_addr;
- struct a_list *next;
-} a_list_t;
-
-STATIC a_list_t *as_free_head;
-STATIC int as_list_len;
-STATIC spinlock_t as_lock = SPIN_LOCK_UNLOCKED;
-
-/*
- * Try to batch vunmaps because they are costly.
- */
-STATIC void
-free_address(
- void *addr)
-{
- a_list_t *aentry;
-
- aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC);
- if (aentry) {
- spin_lock(&as_lock);
- aentry->next = as_free_head;
- aentry->vm_addr = addr;
- as_free_head = aentry;
- as_list_len++;
- spin_unlock(&as_lock);
- } else {
- vunmap(addr);
- }
-}
-
-STATIC void
-purge_addresses(void)
-{
- a_list_t *aentry, *old;
-
- if (as_free_head == NULL)
- return;
-
- spin_lock(&as_lock);
- aentry = as_free_head;
- as_free_head = NULL;
- as_list_len = 0;
- spin_unlock(&as_lock);
-
- while ((old = aentry) != NULL) {
- vunmap(aentry->vm_addr);
- aentry = aentry->next;
- kfree(old);
- }
-}
-
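free_address/purge_addresses amortize the cost of vunmap by queueing addresses on a list under a spinlock and purging them in one unlocked pass. A userspace sketch of the same pattern, with free() standing in for vunmap() and a pthread mutex for the spinlock:

#include <stdlib.h>
#include <pthread.h>

struct a_list { void *addr; struct a_list *next; };

static struct a_list *free_head;
static pthread_mutex_t free_lock = PTHREAD_MUTEX_INITIALIZER;

static void defer_free(void *addr)
{
	struct a_list *e = malloc(sizeof(*e));

	if (!e) {		/* can't queue it: free synchronously */
		free(addr);
		return;
	}
	pthread_mutex_lock(&free_lock);
	e->addr = addr;
	e->next = free_head;
	free_head = e;
	pthread_mutex_unlock(&free_lock);
}

static void purge(void)
{
	struct a_list *e, *old;

	pthread_mutex_lock(&free_lock);
	e = free_head;
	free_head = NULL;
	pthread_mutex_unlock(&free_lock);

	while ((old = e) != NULL) {	/* do the real frees unlocked */
		e = e->next;
		free(old->addr);
		free(old);
	}
}

int main(void)
{
	defer_free(malloc(64));
	defer_free(malloc(64));
	purge();
	return 0;
}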
-/*
- * Internal pagebuf object manipulation
- */
-
-STATIC void
-_pagebuf_initialize(
- page_buf_t *pb,
- pb_target_t *target,
- loff_t range_base,
- size_t range_length,
- page_buf_flags_t flags)
-{
- /*
- * We don't want certain flags to appear in pb->pb_flags.
- */
- flags &= ~(PBF_LOCK|PBF_MAPPED|PBF_DONT_BLOCK|PBF_READ_AHEAD);
-
- memset(pb, 0, sizeof(page_buf_t));
- atomic_set(&pb->pb_hold, 1);
- init_MUTEX_LOCKED(&pb->pb_iodonesema);
- INIT_LIST_HEAD(&pb->pb_list);
- INIT_LIST_HEAD(&pb->pb_hash_list);
- init_MUTEX_LOCKED(&pb->pb_sema); /* held, no waiters */
- PB_SET_OWNER(pb);
- pb->pb_target = target;
- pb->pb_file_offset = range_base;
- /*
- * Set buffer_length and count_desired to the same value initially.
- * IO routines should use count_desired, which will be the same in
- * most cases but may be reset (e.g. XFS recovery).
- */
- pb->pb_buffer_length = pb->pb_count_desired = range_length;
- pb->pb_flags = flags | PBF_NONE;
- pb->pb_bn = PAGE_BUF_DADDR_NULL;
- atomic_set(&pb->pb_pin_count, 0);
- init_waitqueue_head(&pb->pb_waiters);
-
- PB_STATS_INC(pb_create);
- PB_TRACE(pb, "initialize", target);
-}
-
-/*
- * Allocate a page array capable of holding a specified number
- * of pages, and point the page buf at it.
- */
-STATIC int
-_pagebuf_get_pages(
- page_buf_t *pb,
- int page_count,
- page_buf_flags_t flags)
-{
- int gpf_mask = pb_to_gfp(flags);
-
- /* Make sure that we have a page list */
- if (pb->pb_pages == NULL) {
- pb->pb_offset = page_buf_poff(pb->pb_file_offset);
- pb->pb_page_count = page_count;
- if (page_count <= PB_PAGES) {
- pb->pb_pages = pb->pb_page_array;
- } else {
- pb->pb_pages = kmalloc(sizeof(struct page *) *
- page_count, gpf_mask);
- if (pb->pb_pages == NULL)
- return -ENOMEM;
- }
- memset(pb->pb_pages, 0, sizeof(struct page *) * page_count);
- }
- return 0;
-}
-
-/*
- * Walk a pagebuf releasing all the pages contained within it.
- */
-STATIC inline void
-_pagebuf_freepages(
- page_buf_t *pb)
-{
- int buf_index;
-
- for (buf_index = 0; buf_index < pb->pb_page_count; buf_index++) {
- struct page *page = pb->pb_pages[buf_index];
-
- if (page) {
- pb->pb_pages[buf_index] = NULL;
- page_cache_release(page);
- }
- }
-}
-
-/*
- * _pagebuf_free_object
- *
- * _pagebuf_free_object releases the contents of the specified buffer.
- * The modification state of any associated pages is left unchanged.
- */
-void
-_pagebuf_free_object(
- pb_hash_t *hash, /* hash bucket for buffer */
- page_buf_t *pb) /* buffer to deallocate */
-{
- page_buf_flags_t pb_flags = pb->pb_flags;
-
- PB_TRACE(pb, "free_object", 0);
- pb->pb_flags |= PBF_FREED;
-
- if (hash) {
- if (!list_empty(&pb->pb_hash_list)) {
- hash->pb_count--;
- list_del_init(&pb->pb_hash_list);
- }
- spin_unlock(&hash->pb_hash_lock);
- }
-
- if (!(pb_flags & PBF_FREED)) {
-		/* release any virtual mapping */
- if (pb->pb_flags & _PBF_ADDR_ALLOCATED) {
- void *vaddr = pagebuf_mapout_locked(pb);
- if (vaddr) {
- free_address(vaddr);
- }
- }
-
- if (pb->pb_flags & _PBF_MEM_ALLOCATED) {
- if (pb->pb_pages) {
- /* release the pages in the address list */
- if ((pb->pb_pages[0]) &&
- (pb->pb_flags & _PBF_MEM_SLAB)) {
- kfree(pb->pb_addr);
- } else {
- _pagebuf_freepages(pb);
- }
- if (pb->pb_pages != pb->pb_page_array)
- kfree(pb->pb_pages);
- pb->pb_pages = NULL;
- }
- pb->pb_flags &= ~(_PBF_MEM_ALLOCATED|_PBF_MEM_SLAB);
- }
- }
-
- pagebuf_deallocate(pb);
-}
-
-/*
- * _pagebuf_lookup_pages
- *
- * _pagebuf_lookup_pages finds all pages which match the buffer
- * in question and the range of file offsets supplied,
- * and builds the page list for the buffer, if the
- * page list is not already formed or if not all of the pages are
- * already in the list. Invalid pages (pages which have not yet been
- * read in from disk) are assigned for any pages which are not found.
- */
-STATIC int
-_pagebuf_lookup_pages(
- page_buf_t *pb,
- struct address_space *aspace,
- page_buf_flags_t flags)
-{
- loff_t next_buffer_offset;
- unsigned long page_count, pi, index;
- struct page *page;
- int gfp_mask, retry_count = 5, rval = 0;
- int all_mapped, good_pages, nbytes;
- unsigned int blocksize, sectorshift;
- size_t size, offset;
-
-
- /* For pagebufs where we want to map an address, do not use
- * highmem pages - so that we do not need to use kmap resources
- * to access the data.
- *
- * For pages where the caller has indicated there may be resource
- * contention (e.g. called from a transaction) do not flush
- * delalloc pages to obtain memory.
- */
-
- if (flags & PBF_READ_AHEAD) {
- gfp_mask = GFP_READAHEAD;
- retry_count = 0;
- } else if (flags & PBF_DONT_BLOCK) {
- gfp_mask = GFP_NOFS;
- } else if (flags & PBF_MAPPABLE) {
- gfp_mask = GFP_KERNEL;
- } else {
- gfp_mask = GFP_HIGHUSER;
- }
-
- next_buffer_offset = pb->pb_file_offset + pb->pb_buffer_length;
-
- good_pages = page_count = (page_buf_btoc(next_buffer_offset) -
- page_buf_btoct(pb->pb_file_offset));
-
- if (pb->pb_flags & _PBF_ALL_PAGES_MAPPED) {
- /* Bring pages forward in cache */
- for (pi = 0; pi < page_count; pi++) {
- mark_page_accessed(pb->pb_pages[pi]);
- }
- if ((flags & PBF_MAPPED) && !(pb->pb_flags & PBF_MAPPED)) {
- all_mapped = 1;
- goto mapit;
- }
- return 0;
- }
-
- /* Ensure pb_pages field has been initialised */
- rval = _pagebuf_get_pages(pb, page_count, flags);
- if (rval)
- return rval;
-
- rval = pi = 0;
- blocksize = pb->pb_target->pbr_bsize;
- sectorshift = pb->pb_target->pbr_sshift;
- size = pb->pb_count_desired;
- offset = pb->pb_offset;
-
- /* Enter the pages in the page list */
- index = (pb->pb_file_offset - pb->pb_offset) >> PAGE_CACHE_SHIFT;
- for (all_mapped = 1; pi < page_count; pi++, index++) {
- if (pb->pb_pages[pi] == 0) {
- retry:
- page = find_or_create_page(aspace, index, gfp_mask);
- if (!page) {
- if (--retry_count > 0) {
- PB_STATS_INC(pb_page_retries);
- pagebuf_daemon_wakeup(1);
- current->state = TASK_UNINTERRUPTIBLE;
- schedule_timeout(10);
- goto retry;
- }
- rval = -ENOMEM;
- all_mapped = 0;
- continue;
- }
- PB_STATS_INC(pb_page_found);
- mark_page_accessed(page);
- pb->pb_pages[pi] = page;
- } else {
- page = pb->pb_pages[pi];
- lock_page(page);
- }
-
- nbytes = PAGE_CACHE_SIZE - offset;
- if (nbytes > size)
- nbytes = size;
- size -= nbytes;
-
- if (!PageUptodate(page)) {
- if (blocksize == PAGE_CACHE_SIZE) {
- if (flags & PBF_READ)
- pb->pb_locked = 1;
- good_pages--;
- } else if (!PagePrivate(page)) {
- unsigned long i, range;
-
- /*
- * In this case page->private holds a bitmap
- * of uptodate sectors within the page
- */
- ASSERT(blocksize < PAGE_CACHE_SIZE);
- range = (offset + nbytes) >> sectorshift;
- for (i = offset >> sectorshift; i < range; i++)
- if (!test_bit(i, &page->private))
- break;
- if (i != range)
- good_pages--;
- } else {
- good_pages--;
- }
- }
- offset = 0;
- }
-
- if (!pb->pb_locked) {
- for (pi = 0; pi < page_count; pi++) {
- if (pb->pb_pages[pi])
- unlock_page(pb->pb_pages[pi]);
- }
- }
-
-mapit:
- pb->pb_flags |= _PBF_MEM_ALLOCATED;
- if (all_mapped) {
- pb->pb_flags |= _PBF_ALL_PAGES_MAPPED;
-
- /* A single page buffer is always mappable */
- if (page_count == 1) {
- pb->pb_addr = (caddr_t)
- page_address(pb->pb_pages[0]) + pb->pb_offset;
- pb->pb_flags |= PBF_MAPPED;
- } else if (flags & PBF_MAPPED) {
- if (as_list_len > 64)
- purge_addresses();
- pb->pb_addr = vmap(pb->pb_pages, page_count,
- VM_MAP, PAGE_KERNEL);
- if (pb->pb_addr == NULL)
- return -ENOMEM;
- pb->pb_addr += pb->pb_offset;
- pb->pb_flags |= PBF_MAPPED | _PBF_ADDR_ALLOCATED;
- }
- }
- /* If some pages were found with data in them
- * we are not in PBF_NONE state.
- */
- if (good_pages != 0) {
- pb->pb_flags &= ~(PBF_NONE);
- if (good_pages != page_count) {
- pb->pb_flags |= PBF_PARTIAL;
- }
- }
-
- PB_TRACE(pb, "lookup_pages", (long)good_pages);
-
- return rval;
-}
-
-/*
- * Finding and Reading Buffers
- */
-
-/*
- * _pagebuf_find
- *
- * Looks up, and creates if absent, a lockable buffer for
- * a given range of an inode. The buffer is returned
- * locked. If other overlapping buffers exist, they are
- * released before the new buffer is created and locked,
- * which may imply that this call will block until those buffers
- * are unlocked. No I/O is implied by this call.
- */
-STATIC page_buf_t *
-_pagebuf_find( /* find buffer for block */
- pb_target_t *target,/* target for block */
- loff_t ioff, /* starting offset of range */
- size_t isize, /* length of range */
- page_buf_flags_t flags, /* PBF_TRYLOCK */
- page_buf_t *new_pb)/* newly allocated buffer */
-{
- loff_t range_base;
- size_t range_length;
- int hval;
- pb_hash_t *h;
- struct list_head *p;
- page_buf_t *pb;
- int not_locked;
-
- range_base = (ioff << BBSHIFT);
- range_length = (isize << BBSHIFT);
-
- /* Ensure we never do IOs smaller than the sector size */
- BUG_ON(range_length < (1 << target->pbr_sshift));
-
- /* Ensure we never do IOs that are not sector aligned */
- BUG_ON(range_base & (loff_t)target->pbr_smask);
-
- hval = _bhash(target->pbr_bdev, range_base);
- h = &pbhash[hval];
-
- spin_lock(&h->pb_hash_lock);
- list_for_each(p, &h->pb_hash) {
- pb = list_entry(p, page_buf_t, pb_hash_list);
-
- if ((target == pb->pb_target) &&
- (pb->pb_file_offset == range_base) &&
- (pb->pb_buffer_length == range_length)) {
- if (pb->pb_flags & PBF_FREED)
- break;
-			/* If we look at something, bring it to the
- * front of the list for next time
- */
- list_del(&pb->pb_hash_list);
- list_add(&pb->pb_hash_list, &h->pb_hash);
- goto found;
- }
- }
-
- /* No match found */
- if (new_pb) {
- _pagebuf_initialize(new_pb, target, range_base,
- range_length, flags | _PBF_LOCKABLE);
- new_pb->pb_hash_index = hval;
- h->pb_count++;
- list_add(&new_pb->pb_hash_list, &h->pb_hash);
- } else {
- PB_STATS_INC(pb_miss_locked);
- }
-
- spin_unlock(&h->pb_hash_lock);
- return (new_pb);
-
-found:
- atomic_inc(&pb->pb_hold);
- spin_unlock(&h->pb_hash_lock);
-
-	/* Attempt to get the semaphore without sleeping;
- * if this does not work then we need to drop the
- * spinlock and do a hard attempt on the semaphore.
- */
- not_locked = down_trylock(&pb->pb_sema);
- if (not_locked) {
- if (!(flags & PBF_TRYLOCK)) {
- /* wait for buffer ownership */
- PB_TRACE(pb, "get_lock", 0);
- pagebuf_lock(pb);
- PB_STATS_INC(pb_get_locked_waited);
- } else {
- /* We asked for a trylock and failed, no need
- * to look at file offset and length here, we
- * know that this pagebuf at least overlaps our
-			 * pagebuf and is locked; therefore the buffer
-			 * we want either does not exist or is this one.
- */
-
- pagebuf_rele(pb);
- PB_STATS_INC(pb_busy_locked);
- return (NULL);
- }
- } else {
- /* trylock worked */
- PB_SET_OWNER(pb);
- }
-
- if (pb->pb_flags & PBF_STALE)
- pb->pb_flags &= PBF_MAPPABLE | \
- PBF_MAPPED | \
- _PBF_LOCKABLE | \
- _PBF_ALL_PAGES_MAPPED | \
- _PBF_ADDR_ALLOCATED | \
- _PBF_MEM_ALLOCATED | \
- _PBF_MEM_SLAB;
- PB_TRACE(pb, "got_lock", 0);
- PB_STATS_INC(pb_get_locked);
- return (pb);
-}
-
-
-/*
- * pagebuf_find
- *
- * pagebuf_find returns a buffer matching the specified range of
- * data for the specified target, if any of the relevant blocks
- * are in memory. The buffer may have unallocated holes, if
- * some, but not all, of the blocks are in memory. Even where
- * pages are present in the buffer, not all of every page may be
- * valid.
- */
-page_buf_t *
-pagebuf_find( /* find buffer for block */
- /* if the block is in memory */
- pb_target_t *target,/* target for block */
- loff_t ioff, /* starting offset of range */
- size_t isize, /* length of range */
- page_buf_flags_t flags) /* PBF_TRYLOCK */
-{
- return _pagebuf_find(target, ioff, isize, flags, NULL);
-}
-
-/*
- * pagebuf_get
- *
- * pagebuf_get assembles a buffer covering the specified range.
- * Some or all of the blocks in the range may be valid. Storage
- * in memory for all portions of the buffer will be allocated,
- * although backing storage may not be. If PBF_READ is set in
- * flags, pagebuf_iostart is called also.
- */
-page_buf_t *
-pagebuf_get( /* allocate a buffer */
- pb_target_t *target,/* target for buffer */
- loff_t ioff, /* starting offset of range */
- size_t isize, /* length of range */
- page_buf_flags_t flags) /* PBF_TRYLOCK */
-{
- page_buf_t *pb, *new_pb;
- int error;
-
- new_pb = pagebuf_allocate(flags);
- if (unlikely(!new_pb))
- return (NULL);
-
- pb = _pagebuf_find(target, ioff, isize, flags, new_pb);
- if (pb != new_pb) {
- pagebuf_deallocate(new_pb);
- if (unlikely(!pb))
- return (NULL);
- }
-
- PB_STATS_INC(pb_get);
-
- /* fill in any missing pages */
- error = _pagebuf_lookup_pages(pb, pb->pb_target->pbr_mapping, flags);
- if (unlikely(error)) {
- pagebuf_free(pb);
- return (NULL);
- }
-
- /*
- * Always fill in the block number now, the mapped cases can do
- * their own overlay of this later.
- */
- pb->pb_bn = ioff;
- pb->pb_count_desired = pb->pb_buffer_length;
-
- if (flags & PBF_READ) {
- if (PBF_NOT_DONE(pb)) {
- PB_TRACE(pb, "get_read", (unsigned long)flags);
- PB_STATS_INC(pb_get_read);
- pagebuf_iostart(pb, flags);
- } else if (flags & PBF_ASYNC) {
- PB_TRACE(pb, "get_read_async", (unsigned long)flags);
- /*
- * Read ahead call which is already satisfied,
- * drop the buffer
- */
- if (flags & (PBF_LOCK | PBF_TRYLOCK))
- pagebuf_unlock(pb);
- pagebuf_rele(pb);
- return NULL;
- } else {
- PB_TRACE(pb, "get_read_done", (unsigned long)flags);
- /* We do not want read in the flags */
- pb->pb_flags &= ~PBF_READ;
- }
- } else {
- PB_TRACE(pb, "get_write", (unsigned long)flags);
- }
- return (pb);
-}
-
-/*
- * Create a skeletal pagebuf (no pages associated with it).
- */
-page_buf_t *
-pagebuf_lookup(
- struct pb_target *target,
- loff_t ioff,
- size_t isize,
- page_buf_flags_t flags)
-{
- page_buf_t *pb;
-
- pb = pagebuf_allocate(flags);
- if (pb) {
- _pagebuf_initialize(pb, target, ioff, isize, flags);
- }
- return pb;
-}
-
-/*
- * If we are not low on memory then do the readahead in a deadlock
- * safe manner.
- */
-void
-pagebuf_readahead(
- pb_target_t *target,
- loff_t ioff,
- size_t isize,
- page_buf_flags_t flags)
-{
- struct backing_dev_info *bdi;
-
- bdi = target->pbr_mapping->backing_dev_info;
- if (bdi_read_congested(bdi))
- return;
- if (bdi_write_congested(bdi))
- return;
-
- flags |= (PBF_TRYLOCK|PBF_READ|PBF_ASYNC|PBF_MAPPABLE|PBF_READ_AHEAD);
- pagebuf_get(target, ioff, isize, flags);
-}
-
-page_buf_t *
-pagebuf_get_empty(
- size_t len,
- pb_target_t *target)
-{
- page_buf_t *pb;
-
- pb = pagebuf_allocate(_PBF_LOCKABLE);
- if (pb)
- _pagebuf_initialize(pb, target, 0, len, _PBF_LOCKABLE);
- return pb;
-}
-
-static inline struct page *
-mem_to_page(
- void *addr)
-{
- if (((unsigned long)addr < VMALLOC_START) ||
- ((unsigned long)addr >= VMALLOC_END)) {
- return virt_to_page(addr);
- } else {
- return vmalloc_to_page(addr);
- }
-}
-
-int
-pagebuf_associate_memory(
- page_buf_t *pb,
- void *mem,
- size_t len)
-{
- int rval;
- int i = 0;
- size_t ptr;
- size_t end, end_cur;
- off_t offset;
- int page_count;
-
- page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
- offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK);
- if (offset && (len > PAGE_CACHE_SIZE))
- page_count++;
-
- /* Free any previous set of page pointers */
- if (pb->pb_pages && (pb->pb_pages != pb->pb_page_array)) {
- kfree(pb->pb_pages);
- }
- pb->pb_pages = NULL;
- pb->pb_addr = mem;
-
- rval = _pagebuf_get_pages(pb, page_count, 0);
- if (rval)
- return rval;
-
- pb->pb_offset = offset;
- ptr = (size_t) mem & PAGE_CACHE_MASK;
- end = PAGE_CACHE_ALIGN((size_t) mem + len);
- end_cur = end;
- /* set up first page */
- pb->pb_pages[0] = mem_to_page(mem);
-
- ptr += PAGE_CACHE_SIZE;
- pb->pb_page_count = ++i;
- while (ptr < end) {
- pb->pb_pages[i] = mem_to_page((void *)ptr);
- pb->pb_page_count = ++i;
- ptr += PAGE_CACHE_SIZE;
- }
- pb->pb_locked = 0;
-
- pb->pb_count_desired = pb->pb_buffer_length = len;
- pb->pb_flags |= PBF_MAPPED;
-
- return 0;
-}
-
-page_buf_t *
-pagebuf_get_no_daddr(
- size_t len,
- pb_target_t *target)
-{
- int rval;
- void *rmem = NULL;
- page_buf_flags_t flags = _PBF_LOCKABLE | PBF_FORCEIO;
- page_buf_t *pb;
- size_t tlen = 0;
-
- if (unlikely(len > 0x20000))
- return NULL;
-
- pb = pagebuf_allocate(flags);
- if (!pb)
- return NULL;
-
- _pagebuf_initialize(pb, target, 0, len, flags);
-
- do {
- if (tlen == 0) {
- tlen = len; /* first time */
- } else {
- kfree(rmem); /* free the mem from the previous try */
- tlen <<= 1; /* double the size and try again */
- }
- if ((rmem = kmalloc(tlen, GFP_KERNEL)) == 0) {
- pagebuf_free(pb);
- return NULL;
- }
- } while ((size_t)rmem != ((size_t)rmem & ~target->pbr_smask));
-
- if ((rval = pagebuf_associate_memory(pb, rmem, len)) != 0) {
- kfree(rmem);
- pagebuf_free(pb);
- return NULL;
- }
- /* otherwise pagebuf_free just ignores it */
- pb->pb_flags |= (_PBF_MEM_ALLOCATED | _PBF_MEM_SLAB);
- PB_CLEAR_OWNER(pb);
- up(&pb->pb_sema); /* Return unlocked pagebuf */
-
- PB_TRACE(pb, "no_daddr", rmem);
-
- return pb;
-}
-
-
-/*
- * pagebuf_hold
- *
- * Increment reference count on buffer, to hold the buffer concurrently
- * with another thread which may release (free) the buffer asynchronously.
- *
- * Must hold the buffer already to call this function.
- */
-void
-pagebuf_hold(
- page_buf_t *pb)
-{
- atomic_inc(&pb->pb_hold);
- PB_TRACE(pb, "hold", 0);
-}
-
-/*
- * pagebuf_free
- *
- * pagebuf_free releases the specified buffer. The modification
- * state of any associated pages is left unchanged.
- */
-void
-pagebuf_free(
- page_buf_t *pb)
-{
- if (pb->pb_flags & _PBF_LOCKABLE) {
- pb_hash_t *h = pb_hash(pb);
-
- spin_lock(&h->pb_hash_lock);
- _pagebuf_free_object(h, pb);
- } else {
- _pagebuf_free_object(NULL, pb);
- }
-}
-
-/*
- * pagebuf_rele
- *
- * pagebuf_rele releases a hold on the specified buffer. If the
- * the hold count is 1, pagebuf_rele calls pagebuf_free.
- */
-void
-pagebuf_rele(
- page_buf_t *pb)
-{
- pb_hash_t *h;
-
- PB_TRACE(pb, "rele", pb->pb_relse);
- if (pb->pb_flags & _PBF_LOCKABLE) {
- h = pb_hash(pb);
- spin_lock(&h->pb_hash_lock);
- } else {
- h = NULL;
- }
-
- if (atomic_dec_and_test(&pb->pb_hold)) {
- int do_free = 1;
-
- if (pb->pb_relse) {
- atomic_inc(&pb->pb_hold);
- if (h)
- spin_unlock(&h->pb_hash_lock);
- (*(pb->pb_relse)) (pb);
- do_free = 0;
- }
- if (pb->pb_flags & PBF_DELWRI) {
- pb->pb_flags |= PBF_ASYNC;
- atomic_inc(&pb->pb_hold);
- if (h && do_free)
- spin_unlock(&h->pb_hash_lock);
- pagebuf_delwri_queue(pb, 0);
- do_free = 0;
- } else if (pb->pb_flags & PBF_FS_MANAGED) {
- if (h)
- spin_unlock(&h->pb_hash_lock);
- do_free = 0;
- }
-
- if (do_free) {
- _pagebuf_free_object(h, pb);
- }
- } else if (h) {
- spin_unlock(&h->pb_hash_lock);
- }
-}
-
-
-/*
- * Mutual exclusion on buffers. Locking model:
- *
- * Buffers associated with inodes for which buffer locking
- * is not enabled are not protected by semaphores, and are
- * assumed to be exclusively owned by the caller. There is a
- * spinlock in the buffer, used by the caller when concurrent
- * access is possible.
- */
-
-/*
- * pagebuf_cond_lock
- *
- * pagebuf_cond_lock locks a buffer object, if it is not already locked.
- * Note that this in no way
- * locks the underlying pages, so it is only useful for synchronizing
- * concurrent use of page buffer objects, not for synchronizing independent
- * access to the underlying pages.
- */
-int
-pagebuf_cond_lock( /* lock buffer, if not locked */
-					/* (returns -EBUSY if locked) */
- page_buf_t *pb)
-{
- int locked;
-
- ASSERT(pb->pb_flags & _PBF_LOCKABLE);
- locked = down_trylock(&pb->pb_sema) == 0;
- if (locked) {
- PB_SET_OWNER(pb);
- }
- PB_TRACE(pb, "cond_lock", (long)locked);
- return(locked ? 0 : -EBUSY);
-}
-
-/*
- * pagebuf_lock_value
- *
- * Return lock value for a pagebuf
- */
-int
-pagebuf_lock_value(
- page_buf_t *pb)
-{
- ASSERT(pb->pb_flags & _PBF_LOCKABLE);
- return(atomic_read(&pb->pb_sema.count));
-}
-
-/*
- * pagebuf_lock
- *
- * pagebuf_lock locks a buffer object. Note that this in no way
- * locks the underlying pages, so it is only useful for synchronizing
- * concurrent use of page buffer objects, not for synchronizing independent
- * access to the underlying pages.
- */
-int
-pagebuf_lock(
- page_buf_t *pb)
-{
- ASSERT(pb->pb_flags & _PBF_LOCKABLE);
-
- PB_TRACE(pb, "lock", 0);
- if (atomic_read(&pb->pb_io_remaining))
- blk_run_queues();
- down(&pb->pb_sema);
- PB_SET_OWNER(pb);
- PB_TRACE(pb, "locked", 0);
- return 0;
-}
-
-/*
- * pagebuf_unlock
- *
- * pagebuf_unlock releases the lock on the buffer object created by
- * pagebuf_lock or pagebuf_cond_lock (not any
- * pinning of underlying pages created by pagebuf_pin).
- */
-void
-pagebuf_unlock( /* unlock buffer */
- page_buf_t *pb) /* buffer to unlock */
-{
- ASSERT(pb->pb_flags & _PBF_LOCKABLE);
- PB_CLEAR_OWNER(pb);
- up(&pb->pb_sema);
- PB_TRACE(pb, "unlock", 0);
-}
-
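pagebuf_cond_lock and pagebuf_lock are the usual trylock-or-block pair over one semaphore. A pthread rendering for illustration:

#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* 0 on success, nonzero if already held -- the -EBUSY analogue */
static int cond_lock(void)
{
	return pthread_mutex_trylock(&lock);
}

int main(void)
{
	if (cond_lock() == 0)
		puts("got the lock without blocking");
	if (cond_lock() != 0) {		/* held: a caller may block instead */
		puts("busy; falling back to the blocking lock");
		pthread_mutex_unlock(&lock);	/* release the first hold */
		pthread_mutex_lock(&lock);	/* blocks until available */
	}
	pthread_mutex_unlock(&lock);
	return 0;
}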
-
-/*
- * Pinning Buffer Storage in Memory
- */
-
-/*
- * pagebuf_pin
- *
- * pagebuf_pin locks all of the memory represented by a buffer in
- * memory. Multiple calls to pagebuf_pin and pagebuf_unpin, for
- * the same or different buffers affecting a given page, will
- * properly count the number of outstanding "pin" requests. The
- * buffer may be released after the pagebuf_pin and a different
- * buffer used when calling pagebuf_unpin, if desired.
- * pagebuf_pin should be used by the file system when it wants to be
- * assured that no attempt will be made to force the affected
- * memory to disk. It does not assure that a given logical page
- * will not be moved to a different physical page.
- */
-void
-pagebuf_pin(
- page_buf_t *pb)
-{
- atomic_inc(&pb->pb_pin_count);
- PB_TRACE(pb, "pin", (long)pb->pb_pin_count.counter);
-}
-
-/*
- * pagebuf_unpin
- *
- * pagebuf_unpin reverses the locking of memory performed by
- * pagebuf_pin. Note that both functions affect the logical
- * pages associated with the buffer, not the buffer itself.
- */
-void
-pagebuf_unpin(
- page_buf_t *pb)
-{
- if (atomic_dec_and_test(&pb->pb_pin_count)) {
- wake_up_all(&pb->pb_waiters);
- }
- PB_TRACE(pb, "unpin", (long)pb->pb_pin_count.counter);
-}
-
-int
-pagebuf_ispin(
- page_buf_t *pb)
-{
- return atomic_read(&pb->pb_pin_count);
-}
-
-/*
- * pagebuf_wait_unpin
- *
- * pagebuf_wait_unpin waits until all of the memory associated
- * with the buffer is no longer locked in memory. It returns
- * immediately if none of the affected pages are locked.
- */
-static inline void
-_pagebuf_wait_unpin(
- page_buf_t *pb)
-{
- DECLARE_WAITQUEUE (wait, current);
-
- if (atomic_read(&pb->pb_pin_count) == 0)
- return;
-
- add_wait_queue(&pb->pb_waiters, &wait);
- for (;;) {
- current->state = TASK_UNINTERRUPTIBLE;
- if (atomic_read(&pb->pb_pin_count) == 0)
- break;
- if (atomic_read(&pb->pb_io_remaining))
- blk_run_queues();
- schedule();
- }
- remove_wait_queue(&pb->pb_waiters, &wait);
- current->state = TASK_RUNNING;
-}
-
-/*
- * Buffer Utility Routines
- */
-
-/*
- * pagebuf_iodone
- *
- * pagebuf_iodone marks a buffer for which I/O is in progress
- * done with respect to that I/O. The pb_iodone routine, if
- * present, will be called as a side-effect.
- */
-void
-pagebuf_iodone_work(
- void *v)
-{
- page_buf_t *pb = (page_buf_t *)v;
-
- if (pb->pb_iodone) {
- (*(pb->pb_iodone)) (pb);
- return;
- }
-
- if (pb->pb_flags & PBF_ASYNC) {
- if ((pb->pb_flags & _PBF_LOCKABLE) && !pb->pb_relse)
- pagebuf_unlock(pb);
- pagebuf_rele(pb);
- }
-}
-
-void
-pagebuf_iodone(
- page_buf_t *pb,
- int dataio,
- int schedule)
-{
- pb->pb_flags &= ~(PBF_READ | PBF_WRITE);
- if (pb->pb_error == 0) {
- pb->pb_flags &= ~(PBF_PARTIAL | PBF_NONE);
- }
-
- PB_TRACE(pb, "iodone", pb->pb_iodone);
-
- if ((pb->pb_iodone) || (pb->pb_flags & PBF_ASYNC)) {
- if (schedule) {
- INIT_WORK(&pb->pb_iodone_work, pagebuf_iodone_work, pb);
- queue_work(dataio ? pagebuf_dataio_workqueue :
- pagebuf_logio_workqueue, &pb->pb_iodone_work);
- } else {
- pagebuf_iodone_work(pb);
- }
- } else {
- up(&pb->pb_iodonesema);
- }
-}
-
-/*
- * pagebuf_ioerror
- *
- * pagebuf_ioerror sets the error code for a buffer.
- */
-void
-pagebuf_ioerror( /* mark/clear buffer error flag */
- page_buf_t *pb, /* buffer to mark */
- unsigned int error) /* error to store (0 if none) */
-{
- pb->pb_error = error;
- PB_TRACE(pb, "ioerror", (unsigned long)error);
-}
-
-/*
- * pagebuf_iostart
- *
- * pagebuf_iostart initiates I/O on a buffer, based on the flags supplied.
- * If necessary, it will arrange for any disk space allocation required,
- * and it will break up the request if the block mappings require it.
- * The pb_iodone routine in the buffer supplied will only be called
- * when all of the subsidiary I/O requests, if any, have been completed.
- * pagebuf_iostart calls the pagebuf_ioinitiate routine, or
- * pagebuf_iorequest if the former routine is not defined, to start
- * the I/O on a given low-level request.
- */
-int
-pagebuf_iostart( /* start I/O on a buffer */
- page_buf_t *pb, /* buffer to start */
- page_buf_flags_t flags) /* PBF_LOCK, PBF_ASYNC, PBF_READ, */
- /* PBF_WRITE, PBF_DELWRI, */
- /* PBF_SYNC, PBF_DONT_BLOCK */
-{
- int status = 0;
-
- PB_TRACE(pb, "iostart", (unsigned long)flags);
-
- if (flags & PBF_DELWRI) {
- pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC);
- pb->pb_flags |= flags &
- (PBF_DELWRI | PBF_ASYNC | PBF_SYNC);
- pagebuf_delwri_queue(pb, 1);
- return status;
- }
-
- pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC | \
- PBF_DELWRI | PBF_READ_AHEAD | PBF_RUN_QUEUES);
- pb->pb_flags |= flags & (PBF_READ | PBF_WRITE | PBF_ASYNC | \
- PBF_SYNC | PBF_READ_AHEAD | PBF_RUN_QUEUES);
-
- BUG_ON(pb->pb_bn == PAGE_BUF_DADDR_NULL);
-
- /* For writes allow an alternate strategy routine to precede
- * the actual I/O request (which may not be issued at all in
- * a shutdown situation, for example).
- */
- status = (flags & PBF_WRITE) ?
- pagebuf_iostrategy(pb) : pagebuf_iorequest(pb);
-
- /* Wait for I/O if we are not an async request.
- * Note: async I/O request completion will release the buffer,
-	 * and that may already have happened by this point.  So using the
- * buffer pointer from here on, after async I/O, is invalid.
- */
- if (!status && !(flags & PBF_ASYNC))
- status = pagebuf_iowait(pb);
-
- return status;
-}
-
-/*
- * Helper routine for pagebuf_iorequest
- */
-
-STATIC __inline__ int
-_pagebuf_iolocked(
- page_buf_t *pb)
-{
- ASSERT(pb->pb_flags & (PBF_READ|PBF_WRITE));
- if (pb->pb_flags & PBF_READ)
- return pb->pb_locked;
- return ((pb->pb_flags & _PBF_LOCKABLE) == 0);
-}
-
-STATIC __inline__ void
-_pagebuf_iodone(
- page_buf_t *pb,
- int schedule)
-{
- if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) {
- pb->pb_locked = 0;
- pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), schedule);
- }
-}
-
-STATIC int
-bio_end_io_pagebuf(
- struct bio *bio,
- unsigned int bytes_done,
- int error)
-{
- page_buf_t *pb = (page_buf_t *)bio->bi_private;
- unsigned int i, blocksize = pb->pb_target->pbr_bsize;
- unsigned int sectorshift = pb->pb_target->pbr_sshift;
- struct bio_vec *bvec = bio->bi_io_vec;
-
- if (bio->bi_size)
- return 1;
-
- if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
- pb->pb_error = EIO;
-
- for (i = 0; i < bio->bi_vcnt; i++, bvec++) {
- struct page *page = bvec->bv_page;
-
- if (pb->pb_error) {
- SetPageError(page);
- } else if (blocksize == PAGE_CACHE_SIZE) {
- SetPageUptodate(page);
- } else if (!PagePrivate(page)) {
- unsigned int j, range;
-
- ASSERT(blocksize < PAGE_CACHE_SIZE);
- range = (bvec->bv_offset + bvec->bv_len) >> sectorshift;
- for (j = bvec->bv_offset >> sectorshift; j < range; j++)
- set_bit(j, &page->private);
- if (page->private == (unsigned long)(PAGE_CACHE_SIZE-1))
- SetPageUptodate(page);
- }
-
- if (_pagebuf_iolocked(pb)) {
- unlock_page(page);
- }
- }
-
- _pagebuf_iodone(pb, 1);
- bio_put(bio);
- return 0;
-}
-
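For sub-page block sizes, completion records which sectors of a page are valid in a bitmap kept in page->private and marks the page uptodate once every sector bit is set. A standalone sketch of that bookkeeping, assuming 512-byte sectors in a 4k page:

#include <stdio.h>

#define SECTORS_PER_PAGE 8	/* assumed: 4k page, 512-byte sectors */
#define ALL_UPTODATE ((1UL << SECTORS_PER_PAGE) - 1)

int main(void)
{
	unsigned long private = 0;	/* the page->private bitmap */
	int s;

	for (s = 0; s < SECTORS_PER_PAGE; s++) {
		private |= 1UL << s;		/* sector s completed */
		if (private == ALL_UPTODATE)
			printf("page uptodate after sector %d\n", s);
	}
	return 0;
}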
-void
-_pagebuf_ioapply(
- page_buf_t *pb)
-{
- int i, map_i, total_nr_pages, nr_pages;
- struct bio *bio;
- int offset = pb->pb_offset;
- int size = pb->pb_count_desired;
- sector_t sector = pb->pb_bn;
- unsigned int blocksize = pb->pb_target->pbr_bsize;
- int locking = _pagebuf_iolocked(pb);
-
- total_nr_pages = pb->pb_page_count;
- map_i = 0;
-
- /* Special code path for reading a sub page size pagebuf in --
-	 * we populate the whole page, and hence the other metadata
- * in the same page. This optimization is only valid when the
- * filesystem block size and the page size are equal.
- */
- if ((pb->pb_buffer_length < PAGE_CACHE_SIZE) &&
- (pb->pb_flags & PBF_READ) && locking &&
- (blocksize == PAGE_CACHE_SIZE)) {
- bio = bio_alloc(GFP_NOIO, 1);
-
- bio->bi_bdev = pb->pb_target->pbr_bdev;
- bio->bi_sector = sector - (offset >> BBSHIFT);
- bio->bi_end_io = bio_end_io_pagebuf;
- bio->bi_private = pb;
-
- bio_add_page(bio, pb->pb_pages[0], PAGE_CACHE_SIZE, 0);
- size = 0;
-
- atomic_inc(&pb->pb_io_remaining);
-
- goto submit_io;
- }
-
- /* Lock down the pages which we need to for the request */
- if (locking && (pb->pb_flags & PBF_WRITE) && (pb->pb_locked == 0)) {
- for (i = 0; size; i++) {
- int nbytes = PAGE_CACHE_SIZE - offset;
- struct page *page = pb->pb_pages[i];
-
- if (nbytes > size)
- nbytes = size;
-
- lock_page(page);
-
- size -= nbytes;
- offset = 0;
- }
- offset = pb->pb_offset;
- size = pb->pb_count_desired;
- }
-
-next_chunk:
- atomic_inc(&pb->pb_io_remaining);
- nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
- if (nr_pages > total_nr_pages)
- nr_pages = total_nr_pages;
-
- bio = bio_alloc(GFP_NOIO, nr_pages);
- bio->bi_bdev = pb->pb_target->pbr_bdev;
- bio->bi_sector = sector;
- bio->bi_end_io = bio_end_io_pagebuf;
- bio->bi_private = pb;
-
- for (; size && nr_pages; nr_pages--, map_i++) {
- int nbytes = PAGE_CACHE_SIZE - offset;
-
- if (nbytes > size)
- nbytes = size;
-
- if (bio_add_page(bio, pb->pb_pages[map_i],
- nbytes, offset) < nbytes)
- break;
-
- offset = 0;
- sector += nbytes >> BBSHIFT;
- size -= nbytes;
- total_nr_pages--;
- }
-
-submit_io:
- if (likely(bio->bi_size)) {
- submit_bio((pb->pb_flags & PBF_READ) ? READ : WRITE, bio);
- if (size)
- goto next_chunk;
- } else {
- bio_put(bio);
- pagebuf_ioerror(pb, EIO);
- }
-
- if (pb->pb_flags & PBF_RUN_QUEUES) {
- pb->pb_flags &= ~PBF_RUN_QUEUES;
- if (atomic_read(&pb->pb_io_remaining) > 1)
- blk_run_queues();
- }
-}
-
-/*
- * pagebuf_iorequest
- *
- * pagebuf_iorequest is the core I/O request routine.
- * It assumes that the buffer is well-formed and
- * mapped and ready for physical I/O, unlike
- * pagebuf_iostart() and pagebuf_iophysio(). Those
- * routines call the pagebuf_ioinitiate routine to start I/O,
- * if it is present, or else call pagebuf_iorequest()
- * directly if the pagebuf_ioinitiate routine is not present.
- *
- * This function will be responsible for ensuring access to the
- * pages is restricted whilst I/O is in progress - for locking
- * pagebufs the pagebuf lock is the mediator, for non-locking
- * pagebufs the pages will be locked. In the locking case we
- * need to use the pagebuf lock as multiple meta-data buffers
- * will reference the same page.
- */
-int
-pagebuf_iorequest( /* start real I/O */
- page_buf_t *pb) /* buffer to convey to device */
-{
- PB_TRACE(pb, "iorequest", 0);
-
- if (pb->pb_flags & PBF_DELWRI) {
- pagebuf_delwri_queue(pb, 1);
- return 0;
- }
-
- if (pb->pb_flags & PBF_WRITE) {
- _pagebuf_wait_unpin(pb);
- }
-
- pagebuf_hold(pb);
-
-	/* Set the count to 1 initially; this stops an I/O
- * completion callout which happens before we have started
- * all the I/O from calling pagebuf_iodone too early.
- */
- atomic_set(&pb->pb_io_remaining, 1);
- _pagebuf_ioapply(pb);
- _pagebuf_iodone(pb, 0);
-
- pagebuf_rele(pb);
- return 0;
-}
-
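The atomic_set(&pb->pb_io_remaining, 1) above is the classic "bias the counter" trick: the submitter holds one reference of its own, so a fast per-chunk completion cannot fire the final iodone before all chunks have been issued. The shape of it, single-threaded for clarity:

#include <stdio.h>

static int io_remaining;

static void iodone(void) { puts("all I/O complete"); }

static void chunk_done(void)
{
	if (--io_remaining == 0)	/* atomic_dec_and_test analogue */
		iodone();
}

int main(void)
{
	int chunks = 3, i;

	io_remaining = 1;		/* submitter's bias reference */
	for (i = 0; i < chunks; i++) {
		io_remaining++;		/* one per submitted chunk */
		chunk_done();		/* chunk may complete immediately */
	}
	chunk_done();			/* drop the bias: fires iodone */
	return 0;
}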
-/*
- * pagebuf_iowait
- *
- * pagebuf_iowait waits for I/O to complete on the buffer supplied.
- * It returns immediately if no I/O is pending. In any case, it returns
- * the error code, if any, or 0 if there is no error.
- */
-int
-pagebuf_iowait(
- page_buf_t *pb)
-{
- PB_TRACE(pb, "iowait", 0);
- if (atomic_read(&pb->pb_io_remaining))
- blk_run_queues();
- down(&pb->pb_iodonesema);
- PB_TRACE(pb, "iowaited", (long)pb->pb_error);
- return pb->pb_error;
-}
-
-STATIC void *
-pagebuf_mapout_locked(
- page_buf_t *pb)
-{
- void *old_addr = NULL;
-
- if (pb->pb_flags & PBF_MAPPED) {
- if (pb->pb_flags & _PBF_ADDR_ALLOCATED)
- old_addr = pb->pb_addr - pb->pb_offset;
- pb->pb_addr = NULL;
- pb->pb_flags &= ~(PBF_MAPPED | _PBF_ADDR_ALLOCATED);
- }
-
-	return old_addr;	/* Caller must free the address space;
-				 * we are under a spin lock, so it is
-				 * probably not safe to do vfree here
- */
-}
-
-caddr_t
-pagebuf_offset(
- page_buf_t *pb,
- size_t offset)
-{
- struct page *page;
-
- offset += pb->pb_offset;
-
- page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT];
- return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1));
-}
-
-/*
- * pagebuf_iomove
- *
- * Move data into or out of a buffer.
- */
-void
-pagebuf_iomove(
- page_buf_t *pb, /* buffer to process */
- size_t boff, /* starting buffer offset */
- size_t bsize, /* length to copy */
- caddr_t data, /* data address */
- page_buf_rw_t mode) /* read/write flag */
-{
- size_t bend, cpoff, csize;
- struct page *page;
-
- bend = boff + bsize;
- while (boff < bend) {
- page = pb->pb_pages[page_buf_btoct(boff + pb->pb_offset)];
- cpoff = page_buf_poff(boff + pb->pb_offset);
- csize = min_t(size_t,
- PAGE_CACHE_SIZE-cpoff, pb->pb_count_desired-boff);
-
- ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE));
-
- switch (mode) {
- case PBRW_ZERO:
- memset(page_address(page) + cpoff, 0, csize);
- break;
- case PBRW_READ:
- memcpy(data, page_address(page) + cpoff, csize);
- break;
- case PBRW_WRITE:
- memcpy(page_address(page) + cpoff, data, csize);
- }
-
- boff += csize;
- data += csize;
- }
-}
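pagebuf_iomove() never assumes the buffer is virtually contiguous: each iteration resolves the page backing the current offset and copies at most the remainder of that page. A rough user-space sketch of the write case, with PAGE_SZ and the pages array standing in for PAGE_CACHE_SIZE and pb_pages (and clamping to the requested range rather than pb_count_desired):

#include <string.h>
#include <stddef.h>

#define PAGE_SZ	4096

static void iomove_write(char **pages, size_t pb_offset,
			 size_t boff, size_t bsize, const char *data)
{
	size_t bend = boff + bsize;

	while (boff < bend) {
		size_t off   = boff + pb_offset;
		char  *page  = pages[off / PAGE_SZ];	/* backing page */
		size_t cpoff = off % PAGE_SZ;		/* offset in page */
		size_t csize = PAGE_SZ - cpoff;		/* room in page */

		if (csize > bend - boff)
			csize = bend - boff;		/* clamp to request */
		memcpy(page + cpoff, data, csize);
		boff += csize;
		data += csize;
	}
}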
-
-
-/*
- * Pagebuf delayed write buffer handling
- */
-
-STATIC int pbd_active = 1;
-STATIC LIST_HEAD(pbd_delwrite_queue);
-STATIC spinlock_t pbd_delwrite_lock = SPIN_LOCK_UNLOCKED;
-
-STATIC void
-pagebuf_delwri_queue(
- page_buf_t *pb,
- int unlock)
-{
- PB_TRACE(pb, "delwri_q", (long)unlock);
- spin_lock(&pbd_delwrite_lock);
- /* If already in the queue, dequeue and place at tail */
- if (!list_empty(&pb->pb_list)) {
- if (unlock) {
- atomic_dec(&pb->pb_hold);
- }
- list_del(&pb->pb_list);
- }
-
- list_add_tail(&pb->pb_list, &pbd_delwrite_queue);
- pb->pb_flushtime = jiffies + pb_params.age_buffer.val;
- spin_unlock(&pbd_delwrite_lock);
-
- if (unlock && (pb->pb_flags & _PBF_LOCKABLE)) {
- pagebuf_unlock(pb);
- }
-}
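Because every buffer is (re)queued at the tail with a fresh flushtime, pbd_delwrite_queue stays sorted by flush deadline; that is what lets the daemon below stop scanning at the first buffer whose deadline has not yet expired. A small sketch of that scan, with now standing in for jiffies:

#include <stdio.h>

struct buf {
	unsigned long	deadline;
	struct buf	*next;
};

/* Walk a deadline-sorted delwri list and stop at the first buffer
 * that is not due yet (mirrors the time_before() check and break
 * in pagebuf_daemon below). */
static void flush_due(struct buf *head, unsigned long now)
{
	struct buf *b;

	for (b = head; b != NULL; b = b->next) {
		if (now < b->deadline)
			break;		/* later entries are newer still */
		printf("flush buffer, deadline %lu\n", b->deadline);
	}
}

int main(void)
{
	struct buf b2 = { 200, NULL }, b1 = { 100, &b2 };

	flush_due(&b1, 150);	/* flushes b1 only */
	return 0;
}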
-
-void
-pagebuf_delwri_dequeue(
- page_buf_t *pb)
-{
- PB_TRACE(pb, "delwri_uq", 0);
- spin_lock(&pbd_delwrite_lock);
- list_del_init(&pb->pb_list);
- pb->pb_flags &= ~PBF_DELWRI;
- spin_unlock(&pbd_delwrite_lock);
-}
-
-STATIC void
-pagebuf_runall_queues(
- struct workqueue_struct *queue)
-{
- flush_workqueue(queue);
-}
-
-/* Defines for pagebuf daemon */
-DECLARE_WAIT_QUEUE_HEAD(pbd_waitq);
-STATIC int force_flush;
-
-STATIC void
-pagebuf_daemon_wakeup(
- int flag)
-{
- force_flush = flag;
- if (waitqueue_active(&pbd_waitq)) {
- wake_up_interruptible(&pbd_waitq);
- }
-}
-
-typedef void (*timeout_fn)(unsigned long);
-
-STATIC int
-pagebuf_daemon(
- void *data)
-{
- int count;
- page_buf_t *pb;
- struct list_head *curr, *next, tmp;
- struct timer_list pb_daemon_timer =
- TIMER_INITIALIZER((timeout_fn)pagebuf_daemon_wakeup, 0, 0);
-
- /* Set up the thread */
- daemonize("pagebufd");
-
- current->flags |= PF_MEMALLOC;
-
- INIT_LIST_HEAD(&tmp);
- do {
- /* swsusp */
- if (current->flags & PF_FREEZE)
- refrigerator(PF_IOTHREAD);
-
- if (pbd_active == 1) {
- mod_timer(&pb_daemon_timer,
- jiffies + pb_params.flush_interval.val);
- interruptible_sleep_on(&pbd_waitq);
- }
-
- if (pbd_active == 0) {
- del_timer_sync(&pb_daemon_timer);
- }
-
- spin_lock(&pbd_delwrite_lock);
-
- count = 0;
- list_for_each_safe(curr, next, &pbd_delwrite_queue) {
- pb = list_entry(curr, page_buf_t, pb_list);
-
- PB_TRACE(pb, "walkq1", (long)pagebuf_ispin(pb));
-
- if ((pb->pb_flags & PBF_DELWRI) && !pagebuf_ispin(pb) &&
- (((pb->pb_flags & _PBF_LOCKABLE) == 0) ||
- !pagebuf_cond_lock(pb))) {
-
- if (!force_flush &&
- time_before(jiffies, pb->pb_flushtime)) {
- pagebuf_unlock(pb);
- break;
- }
-
- pb->pb_flags &= ~PBF_DELWRI;
- pb->pb_flags |= PBF_WRITE;
-
- list_del(&pb->pb_list);
- list_add(&pb->pb_list, &tmp);
-
- count++;
- }
- }
-
- spin_unlock(&pbd_delwrite_lock);
- while (!list_empty(&tmp)) {
- pb = list_entry(tmp.next, page_buf_t, pb_list);
- list_del_init(&pb->pb_list);
-
- pagebuf_iostrategy(pb);
- }
-
- if (as_list_len > 0)
- purge_addresses();
- if (count)
- blk_run_queues();
-
- force_flush = 0;
- } while (pbd_active == 1);
-
- pbd_active = -1;
- wake_up_interruptible(&pbd_waitq);
-
- return 0;
-}
-
-void
-pagebuf_delwri_flush(
- pb_target_t *target,
- u_long flags,
- int *pinptr)
-{
- page_buf_t *pb;
- struct list_head *curr, *next, tmp;
- int pincount = 0;
- int flush_cnt = 0;
-
- pagebuf_runall_queues(pagebuf_dataio_workqueue);
- pagebuf_runall_queues(pagebuf_logio_workqueue);
-
- spin_lock(&pbd_delwrite_lock);
- INIT_LIST_HEAD(&tmp);
-
- list_for_each_safe(curr, next, &pbd_delwrite_queue) {
- pb = list_entry(curr, page_buf_t, pb_list);
-
- /*
- * Skip other targets, markers and in progress buffers
- */
-
- if ((pb->pb_flags == 0) || (pb->pb_target != target) ||
- !(pb->pb_flags & PBF_DELWRI)) {
- continue;
- }
-
- PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb));
- if (pagebuf_ispin(pb)) {
- pincount++;
- continue;
- }
-
- pb->pb_flags &= ~PBF_DELWRI;
- pb->pb_flags |= PBF_WRITE;
- list_move(&pb->pb_list, &tmp);
- }
- /* ok found all the items that can be worked on
- * drop the lock and process the private list */
- spin_unlock(&pbd_delwrite_lock);
-
- list_for_each_safe(curr, next, &tmp) {
- pb = list_entry(curr, page_buf_t, pb_list);
-
- if (flags & PBDF_WAIT)
- pb->pb_flags &= ~PBF_ASYNC;
- else
- list_del_init(curr);
-
- pagebuf_lock(pb);
- pagebuf_iostrategy(pb);
- if (++flush_cnt > 32) {
- blk_run_queues();
- flush_cnt = 0;
- }
- }
-
- blk_run_queues();
-
- while (!list_empty(&tmp)) {
- pb = list_entry(tmp.next, page_buf_t, pb_list);
-
- list_del_init(&pb->pb_list);
- pagebuf_iowait(pb);
- if (!pb->pb_relse)
- pagebuf_unlock(pb);
- pagebuf_rele(pb);
- }
-
- if (pinptr)
- *pinptr = pincount;
-}
-
-STATIC int
-pagebuf_daemon_start(void)
-{
- int rval;
-
- pagebuf_logio_workqueue = create_workqueue("xfslogd");
- if (!pagebuf_logio_workqueue)
- return -ENOMEM;
-
- pagebuf_dataio_workqueue = create_workqueue("xfsdatad");
- if (!pagebuf_dataio_workqueue) {
- destroy_workqueue(pagebuf_logio_workqueue);
- return -ENOMEM;
- }
-
- rval = kernel_thread(pagebuf_daemon, NULL, CLONE_FS|CLONE_FILES);
- if (rval < 0) {
- destroy_workqueue(pagebuf_logio_workqueue);
- destroy_workqueue(pagebuf_dataio_workqueue);
- }
-
- return rval;
-}
-
-/*
- * pagebuf_daemon_stop
- *
- * Note: do not mark as __exit, it is called from pagebuf_terminate.
- */
-STATIC void
-pagebuf_daemon_stop(void)
-{
- pbd_active = 0;
- wake_up_interruptible(&pbd_waitq);
- wait_event_interruptible(pbd_waitq, pbd_active);
- destroy_workqueue(pagebuf_logio_workqueue);
- destroy_workqueue(pagebuf_dataio_workqueue);
-}
-
-
-/*
- * Pagebuf sysctl interface
- */
-
-STATIC int
-pb_stats_clear_handler(
- ctl_table *ctl,
- int write,
- struct file *filp,
- void *buffer,
- size_t *lenp)
-{
- int c, ret;
- int *valp = ctl->data;
-
- ret = proc_dointvec_minmax(ctl, write, filp, buffer, lenp);
-
- if (!ret && write && *valp) {
- printk("XFS Clearing pbstats\n");
- for (c = 0; c < NR_CPUS; c++) {
- if (!cpu_possible(c)) continue;
- memset(&per_cpu(pbstats, c), 0,
- sizeof(struct pbstats));
- }
- pb_params.stats_clear.val = 0;
- }
-
- return ret;
-}
-
-STATIC struct ctl_table_header *pagebuf_table_header;
-
-STATIC ctl_table pagebuf_table[] = {
- {PB_FLUSH_INT, "flush_int", &pb_params.flush_interval.val,
- sizeof(int), 0644, NULL, &proc_dointvec_minmax,
- &sysctl_intvec, NULL,
- &pb_params.flush_interval.min, &pb_params.flush_interval.max},
-
- {PB_FLUSH_AGE, "flush_age", &pb_params.age_buffer.val,
- sizeof(int), 0644, NULL, &proc_dointvec_minmax,
- &sysctl_intvec, NULL,
- &pb_params.age_buffer.min, &pb_params.age_buffer.max},
-
- {PB_STATS_CLEAR, "stats_clear", &pb_params.stats_clear.val,
- sizeof(int), 0644, NULL, &pb_stats_clear_handler,
- &sysctl_intvec, NULL,
- &pb_params.stats_clear.min, &pb_params.stats_clear.max},
-
-#ifdef PAGEBUF_TRACE
- {PB_DEBUG, "debug", &pb_params.debug.val,
- sizeof(int), 0644, NULL, &proc_dointvec_minmax,
- &sysctl_intvec, NULL,
- &pb_params.debug.min, &pb_params.debug.max},
-#endif
- {0}
-};
-
-STATIC ctl_table pagebuf_dir_table[] = {
- {VM_PAGEBUF, "pagebuf", NULL, 0, 0555, pagebuf_table},
- {0}
-};
-
-STATIC ctl_table pagebuf_root_table[] = {
- {CTL_VM, "vm", NULL, 0, 0555, pagebuf_dir_table},
- {0}
-};
-
-#ifdef CONFIG_PROC_FS
-STATIC int
-pagebuf_readstats(
- char *buffer,
- char **start,
- off_t offset,
- int count,
- int *eof,
- void *data)
-{
- int c, i, len, val;
-
- len = 0;
- len += sprintf(buffer + len, "pagebuf");
- for (i = 0; i < sizeof(struct pbstats) / sizeof(u_int32_t); i++) {
- val = 0;
- for (c = 0 ; c < NR_CPUS; c++) {
- if (!cpu_possible(c)) continue;
- val += *(((u_int32_t*)&per_cpu(pbstats, c) + i));
- }
- len += sprintf(buffer + len, " %u", val);
- }
- buffer[len++] = '\n';
-
- if (offset >= len) {
- *start = buffer;
- *eof = 1;
- return 0;
- }
- *start = buffer + offset;
- if ((len -= offset) > count)
- return count;
- *eof = 1;
-
- return len;
-}
-#endif /* CONFIG_PROC_FS */
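pagebuf_readstats() follows the legacy read_proc contract: format the whole report into buffer, then return only the [offset, offset+count) window and set *eof once the end has been handed out. The slicing at the end of the function is the subtle part; a standalone sketch of just that contract:

#include <stddef.h>

/* Given a fully formatted buffer of length len, return the size of
 * the slice starting at offset, capped at count, and flag *eof when
 * the final bytes have been returned. */
static int proc_slice(char *buffer, char **start, long offset,
		      int count, int *eof, int len)
{
	if (offset >= len) {
		*start = buffer;
		*eof = 1;
		return 0;
	}
	*start = buffer + offset;	/* hand back the requested window */
	if (len - offset > count)
		return count;		/* more data remains */
	*eof = 1;
	return len - offset;
}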
-
-/*
- * Initialization and Termination
- */
-
-int __init
-pagebuf_init(void)
-{
- int i;
-
- pagebuf_table_header = register_sysctl_table(pagebuf_root_table, 1);
-
-#ifdef CONFIG_PROC_FS
- if (proc_mkdir("fs/pagebuf", 0))
- create_proc_read_entry(
- "fs/pagebuf/stat", 0, 0, pagebuf_readstats, NULL);
-#endif
-
- pagebuf_cache = kmem_cache_create("page_buf_t", sizeof(page_buf_t), 0,
- SLAB_HWCACHE_ALIGN, NULL, NULL);
- if (pagebuf_cache == NULL) {
- printk("pagebuf: couldn't init pagebuf cache\n");
- pagebuf_terminate();
- return -ENOMEM;
- }
-
- for (i = 0; i < NHASH; i++) {
- spin_lock_init(&pbhash[i].pb_hash_lock);
- INIT_LIST_HEAD(&pbhash[i].pb_hash);
- }
-
-#ifdef PAGEBUF_TRACE
- pagebuf_trace_buf = ktrace_alloc(PAGEBUF_TRACE_SIZE, KM_SLEEP);
-#endif
-
- pagebuf_daemon_start();
- return 0;
-}
-
-
-/*
- * pagebuf_terminate.
- *
- * Note: do not mark as __exit, this is also called from the __init code.
- */
-void
-pagebuf_terminate(void)
-{
- pagebuf_daemon_stop();
-
- kmem_cache_destroy(pagebuf_cache);
-
- unregister_sysctl_table(pagebuf_table_header);
-#ifdef CONFIG_PROC_FS
- remove_proc_entry("fs/pagebuf/stat", NULL);
- remove_proc_entry("fs/pagebuf", NULL);
-#endif
-}
-
-
-/*
- * Module management (for kernel debugger module)
- */
-EXPORT_SYMBOL(pagebuf_offset);
-#ifdef DEBUG
-EXPORT_SYMBOL(pbd_delwrite_queue);
-#endif
+++ /dev/null
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like. Any license provided herein, whether implied or
- * otherwise, applies only to this software file. Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA 94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-
-/*
- * Written by Steve Lord, Jim Mostek, Russell Cattelan at SGI
- */
-
-#ifndef __PAGE_BUF_H__
-#define __PAGE_BUF_H__
-
-#include <linux/config.h>
-#include <linux/list.h>
-#include <linux/types.h>
-#include <linux/spinlock.h>
-#include <asm/system.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/buffer_head.h>
-#include <linux/uio.h>
-
-/*
- * Base types
- */
-
-/* daddr must be signed since -1 is used for bmaps that are not yet allocated */
-typedef loff_t page_buf_daddr_t;
-
-#define PAGE_BUF_DADDR_NULL ((page_buf_daddr_t) (-1LL))
-
-#define page_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE)
-#define page_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT)
-#define page_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT)
-#define page_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK)
-
-typedef enum page_buf_rw_e {
- PBRW_READ = 1, /* transfer into target memory */
- PBRW_WRITE = 2, /* transfer from target memory */
- PBRW_ZERO = 3 /* Zero target memory */
-} page_buf_rw_t;
-
-
-typedef enum page_buf_flags_e { /* pb_flags values */
- PBF_READ = (1 << 0), /* buffer intended for reading from device */
- PBF_WRITE = (1 << 1), /* buffer intended for writing to device */
- PBF_MAPPED = (1 << 2), /* buffer mapped (pb_addr valid) */
- PBF_PARTIAL = (1 << 3), /* buffer partially read */
- PBF_ASYNC = (1 << 4), /* initiator will not wait for completion */
- PBF_NONE = (1 << 5), /* buffer not read at all */
- PBF_DELWRI = (1 << 6), /* buffer has dirty pages */
- PBF_FREED = (1 << 7), /* buffer has been freed and is invalid */
- PBF_SYNC = (1 << 8), /* force updates to disk */
- PBF_MAPPABLE = (1 << 9),/* use directly-addressable pages */
- PBF_STALE = (1 << 10), /* buffer has been staled, do not find it */
- PBF_FS_MANAGED = (1 << 11), /* filesystem controls freeing memory */
- PBF_FS_DATAIOD = (1 << 12), /* schedule IO completion on fs datad */
-
- /* flags used only as arguments to access routines */
- PBF_LOCK = (1 << 13), /* lock requested */
- PBF_TRYLOCK = (1 << 14), /* lock requested, but do not wait */
- PBF_DONT_BLOCK = (1 << 15), /* do not block in current thread */
-
- /* flags used only internally */
- _PBF_LOCKABLE = (1 << 16), /* page_buf_t may be locked */
- _PBF_PRIVATE_BH = (1 << 17), /* do not use public buffer heads */
- _PBF_ALL_PAGES_MAPPED = (1 << 18), /* all pages in range mapped */
- _PBF_ADDR_ALLOCATED = (1 << 19), /* pb_addr space was allocated */
- _PBF_MEM_ALLOCATED = (1 << 20), /* underlying pages are allocated */
- _PBF_MEM_SLAB = (1 << 21), /* underlying pages are slab allocated */
-
- PBF_FORCEIO = (1 << 22), /* ignore any cache state */
- PBF_FLUSH = (1 << 23), /* flush disk write cache */
- PBF_READ_AHEAD = (1 << 24), /* asynchronous read-ahead */
- PBF_RUN_QUEUES = (1 << 25), /* run block device task queue */
-
-} page_buf_flags_t;
-
-#define PBF_UPDATE (PBF_READ | PBF_WRITE)
-#define PBF_NOT_DONE(pb) (((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) != 0)
-#define PBF_DONE(pb) (((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) == 0)
-
-typedef struct pb_target {
- dev_t pbr_dev;
- struct block_device *pbr_bdev;
- struct address_space *pbr_mapping;
- unsigned int pbr_bsize;
- unsigned int pbr_sshift;
- size_t pbr_smask;
-} pb_target_t;
-
-/*
- * page_buf_t: Buffer structure for page cache-based buffers
- *
- * This buffer structure is used by the page cache buffer management routines
- * to refer to an assembly of pages forming a logical buffer. The actual
- * I/O is performed with buffer_head or bio structures, as required by drivers,
- * for drivers which do not understand this structure. The buffer structure is
- * used on temporary basis only, and discarded when released.
- *
- * The real data storage is recorded in the page cache. Metadata is
- * hashed to the inode for the block device on which the file system resides.
- * File data is hashed to the inode for the file. Pages which are only
- * partially filled with data have bits set in their block_map entry
- * to indicate which disk blocks in the page are not valid.
- */
-
-struct page_buf_s;
-typedef void (*page_buf_iodone_t)(struct page_buf_s *);
- /* call-back function on I/O completion */
-typedef void (*page_buf_relse_t)(struct page_buf_s *);
- /* call-back function on I/O completion */
-typedef int (*page_buf_bdstrat_t)(struct page_buf_s *);
-
-#define PB_PAGES 4
-
-typedef struct page_buf_s {
- struct semaphore pb_sema; /* semaphore for lockables */
- unsigned long pb_flushtime; /* time to flush pagebuf */
- atomic_t pb_pin_count; /* pin count */
- wait_queue_head_t pb_waiters; /* unpin waiters */
- struct list_head pb_list;
- page_buf_flags_t pb_flags; /* status flags */
- struct list_head pb_hash_list;
- struct pb_target *pb_target; /* logical object */
- atomic_t pb_hold; /* reference count */
- page_buf_daddr_t pb_bn; /* block number for I/O */
- loff_t pb_file_offset; /* offset in file */
- size_t pb_buffer_length; /* size of buffer in bytes */
- size_t pb_count_desired; /* desired transfer size */
- void *pb_addr; /* virtual address of buffer */
- struct work_struct pb_iodone_work;
- atomic_t pb_io_remaining;/* #outstanding I/O requests */
- page_buf_iodone_t pb_iodone; /* I/O completion function */
- page_buf_relse_t pb_relse; /* releasing function */
- page_buf_bdstrat_t pb_strat; /* pre-write function */
- struct semaphore pb_iodonesema; /* Semaphore for I/O waiters */
- void *pb_fspriv;
- void *pb_fspriv2;
- void *pb_fspriv3;
- unsigned short pb_error; /* error code on I/O */
- unsigned short pb_page_count; /* size of page array */
- unsigned short pb_offset; /* page offset in first page */
- unsigned char pb_locked; /* page array is locked */
- unsigned char pb_hash_index; /* hash table index */
- struct page **pb_pages; /* array of page pointers */
- struct page *pb_page_array[PB_PAGES]; /* inline pages */
-#ifdef PAGEBUF_LOCK_TRACKING
- int pb_last_holder;
-#endif
-} page_buf_t;
-
-
-/* Finding and Reading Buffers */
-
-extern page_buf_t *pagebuf_find( /* find buffer for block if */
- /* the block is in memory */
- struct pb_target *, /* inode for block */
- loff_t, /* starting offset of range */
- size_t, /* length of range */
- page_buf_flags_t); /* PBF_LOCK */
-
-extern page_buf_t *pagebuf_get( /* allocate a buffer */
- struct pb_target *, /* inode for buffer */
- loff_t, /* starting offset of range */
- size_t, /* length of range */
- page_buf_flags_t); /* PBF_LOCK, PBF_READ, */
- /* PBF_ASYNC */
-
-extern page_buf_t *pagebuf_lookup(
- struct pb_target *,
- loff_t, /* starting offset of range */
- size_t, /* length of range */
- page_buf_flags_t); /* PBF_READ, PBF_WRITE, */
- /* PBF_FORCEIO, _PBF_LOCKABLE */
-
-extern page_buf_t *pagebuf_get_empty( /* allocate pagebuf struct with */
- /* no memory or disk address */
- size_t len,
- struct pb_target *); /* mount point "fake" inode */
-
-extern page_buf_t *pagebuf_get_no_daddr(/* allocate pagebuf struct */
- /* without disk address */
- size_t len,
- struct pb_target *); /* mount point "fake" inode */
-
-extern int pagebuf_associate_memory(
- page_buf_t *,
- void *,
- size_t);
-
-extern void pagebuf_hold( /* increment reference count */
- page_buf_t *); /* buffer to hold */
-
-extern void pagebuf_readahead( /* read ahead into cache */
- struct pb_target *, /* target for buffer (or NULL) */
- loff_t, /* starting offset of range */
- size_t, /* length of range */
- page_buf_flags_t); /* additional read flags */
-
-/* Releasing Buffers */
-
-extern void pagebuf_free( /* deallocate a buffer */
- page_buf_t *); /* buffer to deallocate */
-
-extern void pagebuf_rele( /* release hold on a buffer */
- page_buf_t *); /* buffer to release */
-
-/* Locking and Unlocking Buffers */
-
-extern int pagebuf_cond_lock( /* lock buffer, if not locked */
- /* (returns -EBUSY if locked) */
- page_buf_t *); /* buffer to lock */
-
-extern int pagebuf_lock_value( /* return count on lock */
- page_buf_t *); /* buffer to check */
-
-extern int pagebuf_lock( /* lock buffer */
- page_buf_t *); /* buffer to lock */
-
-extern void pagebuf_unlock( /* unlock buffer */
- page_buf_t *); /* buffer to unlock */
-
-/* Buffer Read and Write Routines */
-
-extern void pagebuf_iodone( /* mark buffer I/O complete */
- page_buf_t *, /* buffer to mark */
- int, /* use data/log helper thread. */
- int); /* run completion locally, or in
- * a helper thread. */
-
-extern void pagebuf_ioerror( /* mark buffer in error (or not) */
- page_buf_t *, /* buffer to mark */
- unsigned int); /* error to store (0 if none) */
-
-extern int pagebuf_iostart( /* start I/O on a buffer */
- page_buf_t *, /* buffer to start */
- page_buf_flags_t); /* PBF_LOCK, PBF_ASYNC, */
- /* PBF_READ, PBF_WRITE, */
- /* PBF_DELWRI, PBF_SYNC */
-
-extern int pagebuf_iorequest( /* start real I/O */
- page_buf_t *); /* buffer to convey to device */
-
-extern int pagebuf_iowait( /* wait for buffer I/O done */
- page_buf_t *); /* buffer to wait on */
-
-extern void pagebuf_iomove( /* move data in/out of pagebuf */
- page_buf_t *, /* buffer to manipulate */
- size_t, /* starting buffer offset */
- size_t, /* length in buffer */
- caddr_t, /* data pointer */
- page_buf_rw_t); /* direction */
-
-static inline int pagebuf_iostrategy(page_buf_t *pb)
-{
- return pb->pb_strat ? pb->pb_strat(pb) : pagebuf_iorequest(pb);
-}
-
-static inline int pagebuf_geterror(page_buf_t *pb)
-{
- return pb ? pb->pb_error : ENOMEM;
-}
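pagebuf_iostrategy() is a one-line dispatch: if the filesystem attached a pre-write strategy routine (pb_strat), that routine gets first crack at the I/O; otherwise the request goes straight to pagebuf_iorequest(). A user-space sketch of the same dispatch, with all names hypothetical:

#include <stdio.h>

struct buf;
typedef int (*strat_fn)(struct buf *);

struct buf {
	strat_fn	strat;		/* optional pre-write routine */
};

static int iorequest(struct buf *b)
{
	printf("direct I/O\n");
	return 0;
}

static int log_strat(struct buf *b)
{
	printf("log pre-write\n");	/* e.g. write ordering for the log */
	return iorequest(b);
}

static int iostrategy(struct buf *b)
{
	return b->strat ? b->strat(b) : iorequest(b);
}

int main(void)
{
	struct buf plain = { 0 }, logged = { log_strat };

	iostrategy(&plain);	/* "direct I/O" */
	iostrategy(&logged);	/* "log pre-write" then "direct I/O" */
	return 0;
}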
-
-/* Buffer Utility Routines */
-
-extern caddr_t pagebuf_offset( /* pointer at offset in buffer */
- page_buf_t *, /* buffer to offset into */
- size_t); /* offset */
-
-/* Pinning Buffer Storage in Memory */
-
-extern void pagebuf_pin( /* pin buffer in memory */
- page_buf_t *); /* buffer to pin */
-
-extern void pagebuf_unpin( /* unpin buffered data */
- page_buf_t *); /* buffer to unpin */
-
-extern int pagebuf_ispin( /* check if buffer is pinned */
- page_buf_t *); /* buffer to check */
-
-/* Delayed Write Buffer Routines */
-
-#define PBDF_WAIT 0x01
-extern void pagebuf_delwri_flush(
- pb_target_t *,
- unsigned long,
- int *);
-
-extern void pagebuf_delwri_dequeue(
- page_buf_t *);
-
-/* Buffer Daemon Setup Routines */
-
-extern int pagebuf_init(void);
-extern void pagebuf_terminate(void);
-
-
-#ifdef PAGEBUF_TRACE
-extern ktrace_t *pagebuf_trace_buf;
-extern void pagebuf_trace(
- page_buf_t *, /* buffer being traced */
- char *, /* description of operation */
- void *, /* arbitrary diagnostic value */
- void *); /* return address */
-#else
-# define pagebuf_trace(pb, id, ptr, ra) do { } while (0)
-#endif
-
-#define pagebuf_target_name(target) \
- ({ char __b[BDEVNAME_SIZE]; bdevname((target)->pbr_bdev, __b); __b; })
-
-#endif /* __PAGE_BUF_H__ */
+++ /dev/null
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like. Any license provided herein, whether implied or
- * otherwise, applies only to this software file. Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA 94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-#ifndef __XFS_SUPPORT_KMEM_H__
-#define __XFS_SUPPORT_KMEM_H__
-
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-
-/*
- * Cutoff point to use vmalloc instead of kmalloc.
- */
-#define MAX_SLAB_SIZE 0x10000
-
-/*
- * XFS uses slightly different names for these due to the
- * IRIX heritage.
- */
-#define kmem_zone kmem_cache_s
-#define kmem_zone_t kmem_cache_t
-
-#define KM_SLEEP 0x0001
-#define KM_NOSLEEP 0x0002
-#define KM_NOFS 0x0004
-
-typedef unsigned long xfs_pflags_t;
-
-#define PFLAGS_TEST_FSTRANS() (current->flags & PF_FSTRANS)
-
-#define PFLAGS_SET_FSTRANS(STATEP) do { \
- *(STATEP) = current->flags; \
- current->flags |= PF_FSTRANS; \
-} while (0)
-
-#define PFLAGS_RESTORE(STATEP) do { \
- current->flags = *(STATEP); \
-} while (0)
-
-#define PFLAGS_DUP(OSTATEP, NSTATEP) do { \
- *(NSTATEP) = *(OSTATEP); \
-} while (0)
-
-/*
- * XXX get rid of the unconditional __GFP_NOFAIL by adding
- * a KM_FAIL flag and using it where we're allowed to fail.
- */
-static __inline unsigned int
-kmem_flags_convert(int flags)
-{
- int lflags;
-
-#if DEBUG
- if (unlikely(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS))) {
- printk(KERN_WARNING
- "XFS: memory allocation with wrong flags (%x)\n", flags);
- BUG();
- }
-#endif
-
- lflags = (flags & KM_NOSLEEP) ? GFP_ATOMIC : (GFP_KERNEL|__GFP_NOFAIL);
-
- /* avoid recusive callbacks to filesystem during transactions */
- if (PFLAGS_TEST_FSTRANS())
- lflags &= ~__GFP_FS;
-
- return lflags;
-}
-
-static __inline void *
-kmem_alloc(size_t size, int flags)
-{
- if (unlikely(MAX_SLAB_SIZE < size))
- /* Avoid doing filesystem sensitive stuff to get this */
- return __vmalloc(size, kmem_flags_convert(flags), PAGE_KERNEL);
- return kmalloc(size, kmem_flags_convert(flags));
-}
-
-static __inline void *
-kmem_zalloc(size_t size, int flags)
-{
- void *ptr = kmem_alloc(size, flags);
- if (likely(ptr != NULL))
- memset(ptr, 0, size);
- return ptr;
-}
-
-static __inline void
-kmem_free(void *ptr, size_t size)
-{
- if (unlikely((unsigned long)ptr < VMALLOC_START ||
- (unsigned long)ptr >= VMALLOC_END))
- kfree(ptr);
- else
- vfree(ptr);
-}
-
-static __inline void *
-kmem_realloc(void *ptr, size_t newsize, size_t oldsize, int flags)
-{
- void *new = kmem_alloc(newsize, flags);
-
- if (likely(ptr != NULL)) {
- if (likely(new != NULL))
- memcpy(new, ptr, min(oldsize, newsize));
- kmem_free(ptr, oldsize);
- }
-
- return new;
-}
-
-static __inline kmem_zone_t *
-kmem_zone_init(int size, char *zone_name)
-{
- return kmem_cache_create(zone_name, size, 0, 0, NULL, NULL);
-}
-
-static __inline void *
-kmem_zone_alloc(kmem_zone_t *zone, int flags)
-{
- return kmem_cache_alloc(zone, kmem_flags_convert(flags));
-}
-
-static __inline void *
-kmem_zone_zalloc(kmem_zone_t *zone, int flags)
-{
- void *ptr = kmem_zone_alloc(zone, flags);
- if (likely(ptr != NULL))
- memset(ptr, 0, kmem_cache_size(zone));
- return ptr;
-}
-
-static __inline void
-kmem_zone_free(kmem_zone_t *zone, void *ptr)
-{
- kmem_cache_free(zone, ptr);
-}
-
-typedef struct shrinker *kmem_shaker_t;
-typedef int (*kmem_shake_func_t)(int, unsigned int);
-
-static __inline kmem_shaker_t
-kmem_shake_register(kmem_shake_func_t sfunc)
-{
- return set_shrinker(DEFAULT_SEEKS, sfunc);
-}
-
-static __inline void
-kmem_shake_deregister(kmem_shaker_t shrinker)
-{
- remove_shrinker(shrinker);
-}
-
-static __inline int
-kmem_shake_allow(unsigned int gfp_mask)
-{
- return (gfp_mask & __GFP_WAIT);
-}
-
-#endif /* __XFS_SUPPORT_KMEM_H__ */
#include <linux/slab.h>
#include <xfs_types.h>
-#include "kmem.h"
-#include "spin.h"
+#include <kmem.h>
+#include <spin.h>
#include "debug.h"
#include "ktrace.h"
#ifndef __XFS_SUPPORT_KTRACE_H__
#define __XFS_SUPPORT_KTRACE_H__
-#include <support/spin.h>
+#include <spin.h>
/*
* Trace buffer entry structure.
+++ /dev/null
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like. Any license provided herein, whether implied or
- * otherwise, applies only to this software file. Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA 94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-
-#include <linux/time.h>
-#include <linux/sched.h>
-#include <asm/system.h>
-#include <linux/interrupt.h>
-#include <asm/current.h>
-
-#include "mrlock.h"
-
-
-#if USE_RW_WAIT_QUEUE_SPINLOCK
-# define wq_write_lock write_lock
-#else
-# define wq_write_lock spin_lock
-#endif
-
-/*
- * We don't seem to need lock_type (only one supported), name, or
- * sequence. But, XFS will pass it so let's leave them here for now.
- */
-/* ARGSUSED */
-void
-mrlock_init(mrlock_t *mrp, int lock_type, char *name, long sequence)
-{
- mrp->mr_count = 0;
- mrp->mr_reads_waiting = 0;
- mrp->mr_writes_waiting = 0;
- init_waitqueue_head(&mrp->mr_readerq);
- init_waitqueue_head(&mrp->mr_writerq);
- mrp->mr_lock = SPIN_LOCK_UNLOCKED;
-}
-
-/*
- * Macros to lock/unlock the mrlock_t.
- */
-
-#define MRLOCK(m) spin_lock(&(m)->mr_lock);
-#define MRUNLOCK(m) spin_unlock(&(m)->mr_lock);
-
-
-/*
- * lock_wait should never be called in an interrupt thread.
- *
- * mrlocks can sleep (i.e. call schedule) and so they can't ever
- * be called from an interrupt thread.
- *
- * threads that wake-up should also never be invoked from interrupt threads.
- *
- * But, waitqueue_lock is locked from interrupt threads - and we are
- * called with interrupts disabled, so it is all OK.
- */
-
-/* ARGSUSED */
-void
-lock_wait(wait_queue_head_t *q, spinlock_t *lock, int rw)
-{
- DECLARE_WAITQUEUE( wait, current );
-
- __set_current_state(TASK_UNINTERRUPTIBLE);
-
- spin_lock(&q->lock);
- if (rw) {
- __add_wait_queue_tail(q, &wait);
- } else {
- __add_wait_queue(q, &wait);
- }
-
- spin_unlock(&q->lock);
- spin_unlock(lock);
-
- schedule();
-
- spin_lock(&q->lock);
- __remove_wait_queue(q, &wait);
- spin_unlock(&q->lock);
-
- spin_lock(lock);
-
- /* return with lock held */
-}
-
-/* ARGSUSED */
-void
-mrfree(mrlock_t *mrp)
-{
-}
-
-/* ARGSUSED */
-void
-mrlock(mrlock_t *mrp, int type, int flags)
-{
- if (type == MR_ACCESS)
- mraccess(mrp);
- else
- mrupdate(mrp);
-}
-
-/* ARGSUSED */
-void
-mraccessf(mrlock_t *mrp, int flags)
-{
- MRLOCK(mrp);
- if(mrp->mr_writes_waiting > 0) {
- mrp->mr_reads_waiting++;
- lock_wait(&mrp->mr_readerq, &mrp->mr_lock, 0);
- mrp->mr_reads_waiting--;
- }
- while (mrp->mr_count < 0) {
- mrp->mr_reads_waiting++;
- lock_wait(&mrp->mr_readerq, &mrp->mr_lock, 0);
- mrp->mr_reads_waiting--;
- }
- mrp->mr_count++;
- MRUNLOCK(mrp);
-}
-
-/* ARGSUSED */
-void
-mrupdatef(mrlock_t *mrp, int flags)
-{
- MRLOCK(mrp);
- while(mrp->mr_count) {
- mrp->mr_writes_waiting++;
- lock_wait(&mrp->mr_writerq, &mrp->mr_lock, 1);
- mrp->mr_writes_waiting--;
- }
-
- mrp->mr_count = -1; /* writer on it */
- MRUNLOCK(mrp);
-}
-
-int
-mrtryaccess(mrlock_t *mrp)
-{
- MRLOCK(mrp);
- /*
- * If anyone is waiting for update access or the lock is held for update
- * fail the request.
- */
- if(mrp->mr_writes_waiting > 0 || mrp->mr_count < 0) {
- MRUNLOCK(mrp);
- return 0;
- }
- mrp->mr_count++;
- MRUNLOCK(mrp);
- return 1;
-}
-
-int
-mrtrypromote(mrlock_t *mrp)
-{
- MRLOCK(mrp);
-
- if(mrp->mr_count == 1) { /* We are the only thread with the lock */
- mrp->mr_count = -1; /* writer on it */
- MRUNLOCK(mrp);
- return 1;
- }
-
- MRUNLOCK(mrp);
- return 0;
-}
-
-int
-mrtryupdate(mrlock_t *mrp)
-{
- MRLOCK(mrp);
-
- if(mrp->mr_count) {
- MRUNLOCK(mrp);
- return 0;
- }
-
- mrp->mr_count = -1; /* writer on it */
- MRUNLOCK(mrp);
- return 1;
-}
-
-static __inline__ void mrwake(mrlock_t *mrp)
-{
- /*
- * First, if the count is now 0, we need to wake-up anyone waiting.
- */
- if (!mrp->mr_count) {
- if (mrp->mr_writes_waiting) { /* Wake-up first writer waiting */
- wake_up(&mrp->mr_writerq);
- } else if (mrp->mr_reads_waiting) { /* Wakeup any readers waiting */
- wake_up(&mrp->mr_readerq);
- }
- }
-}
-
-void
-mraccunlock(mrlock_t *mrp)
-{
- MRLOCK(mrp);
- mrp->mr_count--;
- mrwake(mrp);
- MRUNLOCK(mrp);
-}
-
-void
-mrunlock(mrlock_t *mrp)
-{
- MRLOCK(mrp);
- if (mrp->mr_count < 0) {
- mrp->mr_count = 0;
- } else {
- mrp->mr_count--;
- }
- mrwake(mrp);
- MRUNLOCK(mrp);
-}
-
-int
-ismrlocked(mrlock_t *mrp, int type) /* No need to lock since info can change */
-{
- if (type == MR_ACCESS)
- return (mrp->mr_count > 0); /* Read lock */
- else if (type == MR_UPDATE)
- return (mrp->mr_count < 0); /* Write lock */
- else if (type == (MR_UPDATE | MR_ACCESS))
- return (mrp->mr_count); /* Any type of lock held */
- else /* Any waiters */
- return (mrp->mr_reads_waiting | mrp->mr_writes_waiting);
-}
-
-/*
- * Demote from update to access. We better be the only thread with the
- * lock in update mode so it should be easy to set to 1.
- * Wake-up any readers waiting.
- */
-
-void
-mrdemote(mrlock_t *mrp)
-{
- MRLOCK(mrp);
- mrp->mr_count = 1;
- if (mrp->mr_reads_waiting) { /* Wakeup all readers waiting */
- wake_up(&mrp->mr_readerq);
- }
- MRUNLOCK(mrp);
-}
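The whole mrlock implementation above hangs off one convention: mr_count > 0 means that many readers hold the lock, 0 means free, and -1 means a single writer. A compact user-space sketch of just those state transitions (waitqueues and blocking omitted), covering the mrtryaccess/mrtryupdate/mrtrypromote cases:

#include <assert.h>

struct mr {
	int	count;			/* >0 readers, 0 free, -1 writer */
	int	writes_waiting;
};

static int try_access(struct mr *m)	/* shared/read */
{
	if (m->writes_waiting > 0 || m->count < 0)
		return 0;		/* writer active or queued */
	m->count++;
	return 1;
}

static int try_update(struct mr *m)	/* exclusive/write */
{
	if (m->count != 0)
		return 0;		/* readers or writer present */
	m->count = -1;
	return 1;
}

static int try_promote(struct mr *m)	/* sole reader -> writer */
{
	if (m->count != 1)
		return 0;
	m->count = -1;
	return 1;
}

int main(void)
{
	struct mr m = { 0, 0 };

	assert(try_update(&m) && m.count == -1);
	assert(!try_access(&m));	/* writer holds it */
	m.count = 0;			/* unlock */
	assert(try_access(&m) && m.count == 1);
	assert(try_promote(&m) && m.count == -1);
	return 0;
}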
+++ /dev/null
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like. Any license provided herein, whether implied or
- * otherwise, applies only to this software file. Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA 94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-#ifndef __XFS_SUPPORT_MRLOCK_H__
-#define __XFS_SUPPORT_MRLOCK_H__
-
-#include <linux/time.h>
-#include <linux/wait.h>
-#include <asm/atomic.h>
-#include <asm/semaphore.h>
-
-/*
- * Implement mrlocks on Linux that work for XFS.
- *
- * These are sleep locks and not spinlocks. If one wants read/write spinlocks,
- * use read_lock, write_lock, ... see spinlock.h.
- */
-
-typedef struct mrlock_s {
- int mr_count;
- unsigned short mr_reads_waiting;
- unsigned short mr_writes_waiting;
- wait_queue_head_t mr_readerq;
- wait_queue_head_t mr_writerq;
- spinlock_t mr_lock;
-} mrlock_t;
-
-#define MR_ACCESS 1
-#define MR_UPDATE 2
-
-#define MRLOCK_BARRIER 0x1
-#define MRLOCK_ALLOW_EQUAL_PRI 0x8
-
-/*
- * mraccessf/mrupdatef take flags to be passed in while sleeping;
- * only PLTWAIT is currently supported.
- */
-
-extern void mraccessf(mrlock_t *, int);
-extern void mrupdatef(mrlock_t *, int);
-extern void mrlock(mrlock_t *, int, int);
-extern void mrunlock(mrlock_t *);
-extern void mraccunlock(mrlock_t *);
-extern int mrtryupdate(mrlock_t *);
-extern int mrtryaccess(mrlock_t *);
-extern int mrtrypromote(mrlock_t *);
-extern void mrdemote(mrlock_t *);
-
-extern int ismrlocked(mrlock_t *, int);
-extern void mrlock_init(mrlock_t *, int type, char *name, long sequence);
-extern void mrfree(mrlock_t *);
-
-#define mrinit(mrp, name) mrlock_init(mrp, MRLOCK_BARRIER, name, -1)
-#define mraccess(mrp) mraccessf(mrp, 0) /* grab for READ/ACCESS */
-#define mrupdate(mrp) mrupdatef(mrp, 0) /* grab for WRITE/UPDATE */
-#define mrislocked_access(mrp) ((mrp)->mr_count > 0)
-#define mrislocked_update(mrp) ((mrp)->mr_count < 0)
-
-#endif /* __XFS_SUPPORT_MRLOCK_H__ */
+++ /dev/null
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like. Any license provided herein, whether implied or
- * otherwise, applies only to this software file. Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA 94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-#ifndef __XFS_SUPPORT_MUTEX_H__
-#define __XFS_SUPPORT_MUTEX_H__
-
-#include <linux/spinlock.h>
-#include <asm/semaphore.h>
-
-/*
- * Map the mutex'es from IRIX to Linux semaphores.
- *
- * Destroy just simply initializes to -99 which should block all other
- * callers.
- */
-#define MUTEX_DEFAULT 0x0
-typedef struct semaphore mutex_t;
-
-#define mutex_init(lock, type, name) sema_init(lock, 1)
-#define mutex_destroy(lock) sema_init(lock, -99)
-#define mutex_lock(lock, num) down(lock)
-#define mutex_trylock(lock) (down_trylock(lock) ? 0 : 1)
-#define mutex_unlock(lock) up(lock)
-
-#endif /* __XFS_SUPPORT_MUTEX_H__ */
+++ /dev/null
-/*
- * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like. Any license provided herein, whether implied or
- * otherwise, applies only to this software file. Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA 94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-#ifndef __XFS_SUPPORT_SEMA_H__
-#define __XFS_SUPPORT_SEMA_H__
-
-#include <linux/time.h>
-#include <linux/wait.h>
-#include <asm/atomic.h>
-#include <asm/semaphore.h>
-
-/*
- * sema_t structure just maps to struct semaphore in Linux kernel.
- */
-
-typedef struct semaphore sema_t;
-
-#define init_sema(sp, val, c, d) sema_init(sp, val)
-#define initsema(sp, val) sema_init(sp, val)
-#define initnsema(sp, val, name) sema_init(sp, val)
-#define psema(sp, b) down(sp)
-#define vsema(sp) up(sp)
-#define valusema(sp) (atomic_read(&(sp)->count))
-#define freesema(sema)
-
-/*
- * Map cpsema (try to get the sema) to down_trylock. We need to switch
- * the return values since cpsema returns 1 (acquired) 0 (failed) and
- * down_trylock returns the reverse 0 (acquired) 1 (failed).
- */
-
-#define cpsema(sp) (down_trylock(sp) ? 0 : 1)
-
-/*
- * Didn't do cvsema(sp). Not sure how to map this to up/down/...
- * It does a vsema if the values is < 0 other wise nothing.
- */
-
-#endif /* __XFS_SUPPORT_SEMA_H__ */
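The only non-obvious mapping here is cpsema(): down_trylock() returns 0 on success, while the IRIX convention is 1 = acquired, 0 = failed, hence the inversion. The same inversion, sketched with POSIX semaphores in user space:

#include <semaphore.h>
#include <stdio.h>

/* IRIX-style conditional-p: 1 = acquired, 0 = failed */
static int cpsema_like(sem_t *sp)
{
	return sem_trywait(sp) == 0 ? 1 : 0;
}

int main(void)
{
	sem_t s;

	sem_init(&s, 0, 1);
	printf("first try:  %d\n", cpsema_like(&s));	/* 1: acquired */
	printf("second try: %d\n", cpsema_like(&s));	/* 0: already held */
	sem_destroy(&s);
	return 0;
}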
+++ /dev/null
-/*
- * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like. Any license provided herein, whether implied or
- * otherwise, applies only to this software file. Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA 94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-#ifndef __XFS_SUPPORT_SPIN_H__
-#define __XFS_SUPPORT_SPIN_H__
-
-#include <linux/sched.h> /* preempt needs this */
-#include <linux/spinlock.h>
-
-/*
- * Map lock_t from IRIX to Linux spinlocks.
- *
- * Note that linux turns on/off spinlocks depending on CONFIG_SMP.
- * We don't need to worry about SMP or not here.
- */
-
-#define SPLDECL(s) unsigned long s
-
-typedef spinlock_t lock_t;
-
-#define spinlock_init(lock, name) spin_lock_init(lock)
-#define spinlock_destroy(lock)
-
-static inline unsigned long mutex_spinlock(lock_t *lock)
-{
- spin_lock(lock);
- return 0;
-}
-
-/*ARGSUSED*/
-static inline void mutex_spinunlock(lock_t *lock, unsigned long s)
-{
- spin_unlock(lock);
-}
-
-static inline void nested_spinlock(lock_t *lock)
-{
- spin_lock(lock);
-}
-
-static inline void nested_spinunlock(lock_t *lock)
-{
- spin_unlock(lock);
-}
-
-#endif /* __XFS_SUPPORT_SPIN_H__ */
+++ /dev/null
-/*
- * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like. Any license provided herein, whether implied or
- * otherwise, applies only to this software file. Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA 94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-#ifndef __XFS_SUPPORT_SV_H__
-#define __XFS_SUPPORT_SV_H__
-
-#include <linux/wait.h>
-#include <linux/sched.h>
-#include <linux/spinlock.h>
-
-/*
- * Synchronisation variables.
- *
- * (Parameters "pri", "svf" and "rts" are not implemented)
- */
-
-typedef struct sv_s {
- wait_queue_head_t waiters;
-} sv_t;
-
-#define SV_FIFO 0x0 /* sv_t is FIFO type */
-#define SV_LIFO 0x2 /* sv_t is LIFO type */
-#define SV_PRIO 0x4 /* sv_t is PRIO type */
-#define SV_KEYED 0x6 /* sv_t is KEYED type */
-#define SV_DEFAULT SV_FIFO
-
-
-static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state,
- unsigned long timeout)
-{
- DECLARE_WAITQUEUE(wait, current);
-
- add_wait_queue_exclusive(&sv->waiters, &wait);
- __set_current_state(state);
- spin_unlock(lock);
-
- schedule_timeout(timeout);
-
- remove_wait_queue(&sv->waiters, &wait);
-}
-
-#define init_sv(sv,type,name,flag) \
- init_waitqueue_head(&(sv)->waiters)
-#define sv_init(sv,flag,name) \
- init_waitqueue_head(&(sv)->waiters)
-#define sv_destroy(sv) \
- /*NOTHING*/
-#define sv_wait(sv, pri, lock, s) \
- _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
-#define sv_wait_sig(sv, pri, lock, s) \
- _sv_wait(sv, lock, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
-#define sv_timedwait(sv, pri, lock, s, svf, ts, rts) \
- _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, timespec_to_jiffies(ts))
-#define sv_timedwait_sig(sv, pri, lock, s, svf, ts, rts) \
- _sv_wait(sv, lock, TASK_INTERRUPTIBLE, timespec_to_jiffies(ts))
-#define sv_signal(sv) \
- wake_up(&(sv)->waiters)
-#define sv_broadcast(sv) \
- wake_up_all(&(sv)->waiters)
-
-#endif /* __XFS_SUPPORT_SV_H__ */
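The essential property of _sv_wait() is that the caller arrives holding a spinlock and the primitive queues the waiter before dropping that lock, closing the lost-wakeup window between deciding to sleep and sleeping. pthread condition variables give the same atomic unlock-and-wait guarantee for a mutex; a rough user-space analogue (note that _sv_wait() itself does not reacquire the lock before returning):

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static int ready;

static void sv_wait_like(void)
{
	pthread_mutex_lock(&lock);
	while (!ready)
		pthread_cond_wait(&cond, &lock);  /* drops lock while asleep */
	pthread_mutex_unlock(&lock);
}

static void sv_signal_like(void)
{
	pthread_mutex_lock(&lock);
	ready = 1;
	pthread_cond_signal(&cond);	/* wake one waiter, like sv_signal */
	pthread_mutex_unlock(&lock);
}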
+++ /dev/null
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like. Any license provided herein, whether implied or
- * otherwise, applies only to this software file. Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA 94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-#ifndef __XFS_SUPPORT_TIME_H__
-#define __XFS_SUPPORT_TIME_H__
-
-#include <linux/sched.h>
-#include <linux/time.h>
-
-typedef struct timespec timespec_t;
-
-static inline void delay(long ticks)
-{
- current->state = TASK_UNINTERRUPTIBLE;
- schedule_timeout(ticks);
-}
-
-static inline void nanotime(struct timespec *tvp)
-{
- *tvp = CURRENT_TIME;
-}
-
-#endif /* __XFS_SUPPORT_TIME_H__ */
#include <linux/types.h>
#include <xfs_types.h>
#include <xfs_arch.h>
-#include "time.h"
+#include <time.h>
+#include <kmem.h>
+#include <mutex.h>
#include "uuid.h"
-#include "kmem.h"
#include "debug.h"
-#include "mutex.h"
static mutex_t uuid_monitor;
static int uuid_table_size;
#include <xfs_arch.h>
-#include <support/kmem.h>
-#include <support/mrlock.h>
#include <support/qsort.h>
-#include <support/spin.h>
-#include <support/sv.h>
#include <support/ktrace.h>
-#include <support/mutex.h>
-#include <support/sema.h>
#include <support/debug.h>
#include <support/move.h>
#include <support/uuid.h>
-#include <support/time.h>
#include <linux/xfs_linux.h>
--- /dev/null
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ *
+ */
+#include "xfs.h"
+
+/*
+ * Source file used to associate/disassociate behaviors with virtualized
+ * objects. See xfs_behavior.h for more information about behaviors, etc.
+ *
+ * The implementation is split between functions in this file and macros
+ * in xfs_behavior.h.
+ */
+
+/*
+ * Insert a new behavior descriptor into a behavior chain.
+ *
+ * The behavior chain is ordered based on the 'position' number which
+ * lives in the first field of the ops vector (higher numbers first).
+ *
+ * Attempts to insert duplicate ops result in an EINVAL return code.
+ * Otherwise, return 0 to indicate success.
+ */
+int
+bhv_insert(bhv_head_t *bhp, bhv_desc_t *bdp)
+{
+ bhv_desc_t *curdesc, *prev;
+ int position;
+
+ /*
+ * Validate the position value of the new behavior.
+ */
+ position = BHV_POSITION(bdp);
+ ASSERT(position >= BHV_POSITION_BASE && position <= BHV_POSITION_TOP);
+
+ /*
+ * Find location to insert behavior. Check for duplicates.
+ */
+ prev = NULL;
+ for (curdesc = bhp->bh_first;
+ curdesc != NULL;
+ curdesc = curdesc->bd_next) {
+
+ /* Check for duplication. */
+ if (curdesc->bd_ops == bdp->bd_ops) {
+ ASSERT(0);
+ return EINVAL;
+ }
+
+ /* Find correct position */
+ if (position >= BHV_POSITION(curdesc)) {
+ ASSERT(position != BHV_POSITION(curdesc));
+ break; /* found it */
+ }
+
+ prev = curdesc;
+ }
+
+ if (prev == NULL) {
+ /* insert at front of chain */
+ bdp->bd_next = bhp->bh_first;
+ bhp->bh_first = bdp;
+ } else {
+ /* insert after prev */
+ bdp->bd_next = prev->bd_next;
+ prev->bd_next = bdp;
+ }
+
+ return 0;
+}
+
+/*
+ * Remove a behavior descriptor from a position in a behavior chain;
+ * the position is guaranteed not to be the first position.
+ * Should only be called by the bhv_remove() macro.
+ */
+void
+bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp)
+{
+ bhv_desc_t *curdesc, *prev;
+
+ ASSERT(bhp->bh_first != NULL);
+ ASSERT(bhp->bh_first->bd_next != NULL);
+
+ prev = bhp->bh_first;
+ for (curdesc = bhp->bh_first->bd_next;
+ curdesc != NULL;
+ curdesc = curdesc->bd_next) {
+
+ if (curdesc == bdp)
+ break; /* found it */
+ prev = curdesc;
+ }
+
+ ASSERT(curdesc == bdp);
+ prev->bd_next = bdp->bd_next; /* remove from after prev */
+}
+
+/*
+ * Look for a specific ops vector on the specified behavior chain.
+ * Return the associated behavior descriptor, or NULL if not found.
+ */
+bhv_desc_t *
+bhv_lookup(bhv_head_t *bhp, void *ops)
+{
+ bhv_desc_t *curdesc;
+
+ for (curdesc = bhp->bh_first;
+ curdesc != NULL;
+ curdesc = curdesc->bd_next) {
+
+ if (curdesc->bd_ops == ops)
+ return curdesc;
+ }
+
+ return NULL;
+}
+
+/*
+ * Look for the first behavior within a specified range of positions.
+ * Return the associated behavior descriptor, or NULL if none is found.
+ */
+bhv_desc_t *
+bhv_lookup_range(bhv_head_t *bhp, int low, int high)
+{
+ bhv_desc_t *curdesc;
+
+ for (curdesc = bhp->bh_first;
+ curdesc != NULL;
+ curdesc = curdesc->bd_next) {
+
+ int position = BHV_POSITION(curdesc);
+
+ if (position <= high) {
+ if (position >= low)
+ return curdesc;
+ return NULL;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Return the base behavior in the chain, or NULL if the chain
+ * is empty.
+ *
+ * The caller has not read locked the behavior chain; the chain is
+ * walked here without taking the lock, so callers must ensure the
+ * chain cannot change during the traversal.
+ */
+bhv_desc_t *
+bhv_base(bhv_head_t *bhp)
+{
+ bhv_desc_t *curdesc;
+
+ for (curdesc = bhp->bh_first;
+ curdesc != NULL;
+ curdesc = curdesc->bd_next) {
+
+ if (curdesc->bd_next == NULL) {
+ return curdesc;
+ }
+ }
+
+ return NULL;
+}
+
+void
+bhv_head_init(
+ bhv_head_t *bhp,
+ char *name)
+{
+ bhp->bh_first = NULL;
+}
+
+void
+bhv_insert_initial(
+ bhv_head_t *bhp,
+ bhv_desc_t *bdp)
+{
+ ASSERT(bhp->bh_first == NULL);
+ (bhp)->bh_first = bdp;
+}
+
+void
+bhv_head_destroy(
+ bhv_head_t *bhp)
+{
+ ASSERT(bhp->bh_first == NULL);
+}
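A hypothetical usage sketch of the interfaces above: building a two-layer behavior chain. The names my_ops, my_vobj, my_pdata, and build_chain are made up for illustration; the one real requirement is that each ops vector begin with a bhv_identity_t (declared in xfs_behavior.h below) so that BHV_POSITION() can order the chain. ASSERT here is the XFS debug macro.

struct my_ops {
	bhv_identity_t	mo_id;		/* must be first: fixes chain position */
	int		(*mo_op)(void *);
};

static struct my_ops my_ops_top =
		{ BHV_IDENTITY_INIT_POSITION(BHV_POSITION_TOP), NULL };
static struct my_ops my_ops_base =
		{ BHV_IDENTITY_INIT_POSITION(BHV_POSITION_BASE), NULL };

static void
build_chain(void *my_vobj, void *my_pdata)
{
	bhv_head_t	head;
	bhv_desc_t	top, base;

	bhv_head_init(&head, "example");
	bhv_desc_init(&base, my_pdata, my_vobj, &my_ops_base);
	bhv_insert_initial(&head, &base);
	bhv_desc_init(&top, my_pdata, my_vobj, &my_ops_top);
	bhv_insert(&head, &top);	/* higher position => earlier in chain */
	ASSERT(BHV_HEAD_FIRST(&head) == &top);
}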
--- /dev/null
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_BEHAVIOR_H__
+#define __XFS_BEHAVIOR_H__
+
+/*
+ * Header file used to associate behaviors with virtualized objects.
+ *
+ * A virtualized object is an internal, virtualized representation of
+ * OS entities such as persistent files, processes, or sockets. Examples
+ * of virtualized objects include vnodes, vprocs, and vsockets. Often
+ * a virtualized object is referred to simply as an "object."
+ *
+ * A behavior is essentially an implementation layer associated with
+ * an object. Multiple behaviors for an object are chained together,
+ * the order of chaining determining the order of invocation. Each
+ * behavior of a given object implements the same set of interfaces
+ * (e.g., the VOP interfaces).
+ *
+ * Behaviors may be dynamically inserted into an object's behavior chain,
+ * such that the addition is transparent to consumers that already have
+ * references to the object. Typically, a given behavior will be inserted
+ * at a particular location in the behavior chain. Insertion of new
+ * behaviors is synchronized with operations-in-progress (oip's) so that
+ * the oip's always see a consistent view of the chain.
+ *
+ * The term "interpostion" is used to refer to the act of inserting
+ * a behavior such that it interposes on (i.e., is inserted in front
+ * of) a particular other behavior. A key example of this is when a
+ * system implementing distributed single system image wishes to
+ * interpose a distribution layer (providing distributed coherency)
+ * in front of an object that is otherwise only accessed locally.
+ *
+ * Note that the traditional vnode/inode combination is simply a virtualized
+ * object that has exactly one associated behavior.
+ *
+ * Behavior synchronization is the logic needed, under certain
+ * circumstances, to guarantee that there is no conflict between ongoing
+ * operations traversing the behavior chain and those dynamically
+ * modifying the behavior chain. Because behavior synchronization adds
+ * extra overhead to virtual operation invocation, we want to restrict
+ * the requirement for this extra code, as much as we can, to those
+ * situations in which it is truly necessary.
+ *
+ * Behavior synchronization is needed whenever there's at least one class
+ * of object in the system for which:
+ * 1) multiple behaviors for a given object are supported,
+ * -- AND --
+ * 2a) insertion of a new behavior can happen dynamically at any time
+ *     during the life of an active object,
+ *     -- AND --
+ *     3a) insertion of a new behavior needs to synchronize with existing
+ *         ops-in-progress.
+ *     -- OR --
+ *     3b) multiple different behaviors can be dynamically inserted at
+ *         any time during the life of an active object
+ *     -- OR --
+ *     3c) removal of a behavior can occur at any time during the life of
+ *         an active object.
+ * -- OR --
+ * 2b) removal of a behavior can occur at any time during the life of an
+ *     active object
+ *
+ */
+
+struct bhv_head_lock;
+
+/*
+ * Behavior head. Head of the chain of behaviors.
+ * Contained within each virtualized object data structure.
+ */
+typedef struct bhv_head {
+ struct bhv_desc *bh_first; /* first behavior in chain */
+ struct bhv_head_lock *bh_lockp; /* pointer to lock info struct */
+} bhv_head_t;
+
+/*
+ * Behavior descriptor. Descriptor associated with each behavior.
+ * Contained within the behavior's private data structure.
+ */
+typedef struct bhv_desc {
+ void *bd_pdata; /* private data for this behavior */
+ void *bd_vobj; /* virtual object associated with */
+ void *bd_ops; /* ops for this behavior */
+ struct bhv_desc *bd_next; /* next behavior in chain */
+} bhv_desc_t;
+
+/*
+ * Behavior identity field. A behavior's identity determines the position
+ * where it lives within a behavior chain, and it's always the first field
+ * of the behavior's ops vector. The optional id field further identifies the
+ * subsystem responsible for the behavior.
+ */
+typedef struct bhv_identity {
+ __u16 bi_id; /* owning subsystem id */
+ __u16 bi_position; /* position in chain */
+} bhv_identity_t;
+
+typedef bhv_identity_t bhv_position_t;
+
+#define BHV_IDENTITY_INIT(id,pos) {id, pos}
+#define BHV_IDENTITY_INIT_POSITION(pos) BHV_IDENTITY_INIT(0, pos)
+
+/*
+ * Define boundaries of position values.
+ */
+#define BHV_POSITION_INVALID 0 /* invalid position number */
+#define BHV_POSITION_BASE 1 /* base (last) implementation layer */
+#define BHV_POSITION_TOP 63 /* top (first) implementation layer */
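+
+/*
+ * Illustrative sketch (editorial, not part of this patch): an ops
+ * vector must place its identity first, per the rule above.  The
+ * "foo" names below are hypothetical.
+ *
+ *	typedef struct foo_ops {
+ *		bhv_position_t	foo_position;	-- must be the first field
+ *		int		(*foo_read)(bhv_desc_t *, void *);
+ *	} foo_ops_t;
+ *
+ *	foo_ops_t foo_ops = {
+ *		BHV_IDENTITY_INIT_POSITION(BHV_POSITION_BASE),
+ *		foo_read,
+ *	};
+ */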
+
+/*
+ * Plumbing macros.
+ */
+#define BHV_HEAD_FIRST(bhp) (ASSERT((bhp)->bh_first), (bhp)->bh_first)
+#define BHV_NEXT(bdp) (ASSERT((bdp)->bd_next), (bdp)->bd_next)
+#define BHV_NEXTNULL(bdp) ((bdp)->bd_next)
+#define BHV_VOBJ(bdp) (ASSERT((bdp)->bd_vobj), (bdp)->bd_vobj)
+#define BHV_VOBJNULL(bdp) ((bdp)->bd_vobj)
+#define BHV_PDATA(bdp) (bdp)->bd_pdata
+#define BHV_OPS(bdp) (bdp)->bd_ops
+#define BHV_IDENTITY(bdp) ((bhv_identity_t *)(bdp)->bd_ops)
+#define BHV_POSITION(bdp) (BHV_IDENTITY(bdp)->bi_position)
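+
+/*
+ * Illustrative dispatch sketch (editorial): an operation is typically
+ * invoked on the first behavior and forwarded down the chain until a
+ * layer handles it.  Assuming the hypothetical foo_ops_t above:
+ *
+ *	bhv_desc_t	*bdp = BHV_HEAD_FIRST(bhp);
+ *	foo_ops_t	*ops = (foo_ops_t *)BHV_OPS(bdp);
+ *	int		error = ops->foo_read(bdp, buffer);
+ *
+ * A behavior that does not fully handle foo_read would itself call
+ * BHV_NEXT(bdp) and invoke the same entry point on the next layer.
+ */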
+
+extern void bhv_head_init(bhv_head_t *, char *);
+extern void bhv_head_destroy(bhv_head_t *);
+extern int bhv_insert(bhv_head_t *, bhv_desc_t *);
+extern void bhv_insert_initial(bhv_head_t *, bhv_desc_t *);
+
+/*
+ * Initialize a new behavior descriptor.
+ * Arguments:
+ * bdp - pointer to behavior descriptor
+ * pdata - pointer to behavior's private data
+ * vobj - pointer to associated virtual object
+ * ops - pointer to ops for this behavior
+ */
+#define bhv_desc_init(bdp, pdata, vobj, ops) \
+ { \
+ (bdp)->bd_pdata = pdata; \
+ (bdp)->bd_vobj = vobj; \
+ (bdp)->bd_ops = ops; \
+ (bdp)->bd_next = NULL; \
+ }
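+
+/*
+ * Usage sketch (editorial): attaching an initial behavior to a newly
+ * initialized head, with the hypothetical "foo" names from above:
+ *
+ *	bhv_head_t	bh;
+ *	bhv_desc_t	bd;
+ *
+ *	bhv_head_init(&bh, "foo");
+ *	bhv_desc_init(&bd, &foo_private, &foo_vobj, &foo_ops);
+ *	bhv_insert_initial(&bh, &bd);
+ */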
+
+/*
+ * Remove a behavior descriptor from a behavior chain.
+ */
+#define bhv_remove(bhp, bdp) \
+ { \
+ if ((bhp)->bh_first == (bdp)) { \
+ /* \
+ * Remove from front of chain. \
+ * Atomic wrt oip's. \
+ */ \
+ (bhp)->bh_first = (bdp)->bd_next; \
+ } else { \
+ /* remove from non-front of chain */ \
+ bhv_remove_not_first(bhp, bdp); \
+ } \
+ (bdp)->bd_vobj = NULL; \
+ }
+
+/*
+ * Behavior module prototypes.
+ */
+extern void bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp);
+extern bhv_desc_t * bhv_lookup(bhv_head_t *bhp, void *ops);
+extern bhv_desc_t * bhv_lookup_range(bhv_head_t *bhp, int low, int high);
+extern bhv_desc_t * bhv_base(bhv_head_t *bhp);
+
+/* No bhv locking on Linux */
+#define bhv_lookup_unlocked bhv_lookup
+#define bhv_base_unlocked bhv_base
+
+#endif /* __XFS_BEHAVIOR_H__ */
+++ /dev/null
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like. Any license provided herein, whether implied or
- * otherwise, applies only to this software file. Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA 94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-#ifndef __XFS_BUF_H__
-#define __XFS_BUF_H__
-
-/* These are just for xfs_syncsub... it sets an internal variable
- * then passes it to VOP_FLUSH_PAGES or adds the flags to a newly gotten buf_t
- */
-#define XFS_B_ASYNC PBF_ASYNC
-#define XFS_B_DELWRI PBF_DELWRI
-#define XFS_B_READ PBF_READ
-#define XFS_B_WRITE PBF_WRITE
-#define XFS_B_STALE PBF_STALE
-
-#define XFS_BUF_TRYLOCK PBF_TRYLOCK
-#define XFS_INCORE_TRYLOCK PBF_TRYLOCK
-#define XFS_BUF_LOCK PBF_LOCK
-#define XFS_BUF_MAPPED PBF_MAPPED
-
-#define BUF_BUSY PBF_DONT_BLOCK
-
-#define XFS_BUF_BFLAGS(x) ((x)->pb_flags)
-#define XFS_BUF_ZEROFLAGS(x) \
- ((x)->pb_flags &= ~(PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_SYNC|PBF_DELWRI))
-
-#define XFS_BUF_STALE(x) ((x)->pb_flags |= XFS_B_STALE)
-#define XFS_BUF_UNSTALE(x) ((x)->pb_flags &= ~XFS_B_STALE)
-#define XFS_BUF_ISSTALE(x) ((x)->pb_flags & XFS_B_STALE)
-#define XFS_BUF_SUPER_STALE(x) do { \
- XFS_BUF_STALE(x); \
- xfs_buf_undelay(x); \
- XFS_BUF_DONE(x); \
- } while (0)
-
-#define XFS_BUF_MANAGE PBF_FS_MANAGED
-#define XFS_BUF_UNMANAGE(x) ((x)->pb_flags &= ~PBF_FS_MANAGED)
-
-static inline void xfs_buf_undelay(page_buf_t *pb)
-{
- if (pb->pb_flags & PBF_DELWRI) {
- if (pb->pb_list.next != &pb->pb_list) {
- pagebuf_delwri_dequeue(pb);
- pagebuf_rele(pb);
- } else {
- pb->pb_flags &= ~PBF_DELWRI;
- }
- }
-}
-
-#define XFS_BUF_DELAYWRITE(x) ((x)->pb_flags |= PBF_DELWRI)
-#define XFS_BUF_UNDELAYWRITE(x) xfs_buf_undelay(x)
-#define XFS_BUF_ISDELAYWRITE(x) ((x)->pb_flags & PBF_DELWRI)
-
-#define XFS_BUF_ERROR(x,no) pagebuf_ioerror(x,no)
-#define XFS_BUF_GETERROR(x) pagebuf_geterror(x)
-#define XFS_BUF_ISERROR(x) (pagebuf_geterror(x)?1:0)
-
-#define XFS_BUF_DONE(x) ((x)->pb_flags &= ~(PBF_PARTIAL|PBF_NONE))
-#define XFS_BUF_UNDONE(x) ((x)->pb_flags |= PBF_PARTIAL|PBF_NONE)
-#define XFS_BUF_ISDONE(x) (!(PBF_NOT_DONE(x)))
-
-#define XFS_BUF_BUSY(x) ((x)->pb_flags |= PBF_FORCEIO)
-#define XFS_BUF_UNBUSY(x) ((x)->pb_flags &= ~PBF_FORCEIO)
-#define XFS_BUF_ISBUSY(x) (1)
-
-#define XFS_BUF_ASYNC(x) ((x)->pb_flags |= PBF_ASYNC)
-#define XFS_BUF_UNASYNC(x) ((x)->pb_flags &= ~PBF_ASYNC)
-#define XFS_BUF_ISASYNC(x) ((x)->pb_flags & PBF_ASYNC)
-
-#define XFS_BUF_FLUSH(x) ((x)->pb_flags |= PBF_FLUSH)
-#define XFS_BUF_UNFLUSH(x) ((x)->pb_flags &= ~PBF_FLUSH)
-#define XFS_BUF_ISFLUSH(x) ((x)->pb_flags & PBF_FLUSH)
-
-#define XFS_BUF_SHUT(x) printk("XFS_BUF_SHUT not implemented yet\n")
-#define XFS_BUF_UNSHUT(x) printk("XFS_BUF_UNSHUT not implemented yet\n")
-#define XFS_BUF_ISSHUT(x) (0)
-
-#define XFS_BUF_HOLD(x) pagebuf_hold(x)
-#define XFS_BUF_READ(x) ((x)->pb_flags |= PBF_READ)
-#define XFS_BUF_UNREAD(x) ((x)->pb_flags &= ~PBF_READ)
-#define XFS_BUF_ISREAD(x) ((x)->pb_flags & PBF_READ)
-
-#define XFS_BUF_WRITE(x) ((x)->pb_flags |= PBF_WRITE)
-#define XFS_BUF_UNWRITE(x) ((x)->pb_flags &= ~PBF_WRITE)
-#define XFS_BUF_ISWRITE(x) ((x)->pb_flags & PBF_WRITE)
-
-#define XFS_BUF_ISUNINITIAL(x) (0)
-#define XFS_BUF_UNUNINITIAL(x) (0)
-
-#define XFS_BUF_BP_ISMAPPED(bp) 1
-
-typedef struct page_buf_s xfs_buf_t;
-#define xfs_buf page_buf_s
-
-typedef struct pb_target xfs_buftarg_t;
-#define xfs_buftarg pb_target
-
-#define XFS_BUF_DATAIO(x) ((x)->pb_flags |= PBF_FS_DATAIOD)
-#define XFS_BUF_UNDATAIO(x) ((x)->pb_flags &= ~PBF_FS_DATAIOD)
-
-#define XFS_BUF_IODONE_FUNC(buf) (buf)->pb_iodone
-#define XFS_BUF_SET_IODONE_FUNC(buf, func) \
- (buf)->pb_iodone = (func)
-#define XFS_BUF_CLR_IODONE_FUNC(buf) \
- (buf)->pb_iodone = NULL
-#define XFS_BUF_SET_BDSTRAT_FUNC(buf, func) \
- (buf)->pb_strat = (func)
-#define XFS_BUF_CLR_BDSTRAT_FUNC(buf) \
- (buf)->pb_strat = NULL
-
-#define XFS_BUF_FSPRIVATE(buf, type) \
- ((type)(buf)->pb_fspriv)
-#define XFS_BUF_SET_FSPRIVATE(buf, value) \
- (buf)->pb_fspriv = (void *)(value)
-#define XFS_BUF_FSPRIVATE2(buf, type) \
- ((type)(buf)->pb_fspriv2)
-#define XFS_BUF_SET_FSPRIVATE2(buf, value) \
- (buf)->pb_fspriv2 = (void *)(value)
-#define XFS_BUF_FSPRIVATE3(buf, type) \
- ((type)(buf)->pb_fspriv3)
-#define XFS_BUF_SET_FSPRIVATE3(buf, value) \
- (buf)->pb_fspriv3 = (void *)(value)
-#define XFS_BUF_SET_START(buf)
-
-#define XFS_BUF_SET_BRELSE_FUNC(buf, value) \
- (buf)->pb_relse = (value)
-
-#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->pb_addr)
-
-extern inline xfs_caddr_t xfs_buf_offset(page_buf_t *bp, size_t offset)
-{
- if (bp->pb_flags & PBF_MAPPED)
- return XFS_BUF_PTR(bp) + offset;
- return (xfs_caddr_t) pagebuf_offset(bp, offset);
-}
-
-#define XFS_BUF_SET_PTR(bp, val, count) \
- pagebuf_associate_memory(bp, val, count)
-#define XFS_BUF_ADDR(bp) ((bp)->pb_bn)
-#define XFS_BUF_SET_ADDR(bp, blk) \
- ((bp)->pb_bn = (page_buf_daddr_t)(blk))
-#define XFS_BUF_OFFSET(bp) ((bp)->pb_file_offset)
-#define XFS_BUF_SET_OFFSET(bp, off) \
- ((bp)->pb_file_offset = (off))
-#define XFS_BUF_COUNT(bp) ((bp)->pb_count_desired)
-#define XFS_BUF_SET_COUNT(bp, cnt) \
- ((bp)->pb_count_desired = (cnt))
-#define XFS_BUF_SIZE(bp) ((bp)->pb_buffer_length)
-#define XFS_BUF_SET_SIZE(bp, cnt) \
- ((bp)->pb_buffer_length = (cnt))
-#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)
-#define XFS_BUF_SET_VTYPE(bp, type)
-#define XFS_BUF_SET_REF(bp, ref)
-
-#define XFS_BUF_ISPINNED(bp) pagebuf_ispin(bp)
-
-#define XFS_BUF_VALUSEMA(bp) pagebuf_lock_value(bp)
-#define XFS_BUF_CPSEMA(bp) (pagebuf_cond_lock(bp) == 0)
-#define XFS_BUF_VSEMA(bp) pagebuf_unlock(bp)
-#define XFS_BUF_PSEMA(bp,x) pagebuf_lock(bp)
-#define XFS_BUF_V_IODONESEMA(bp) up(&bp->pb_iodonesema);
-
-/* setup the buffer target from a buftarg structure */
-#define XFS_BUF_SET_TARGET(bp, target) \
- (bp)->pb_target = (target)
-#define XFS_BUF_TARGET(bp) ((bp)->pb_target)
-#define XFS_BUFTARG_NAME(target) \
- pagebuf_target_name(target)
-
-#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)
-#define XFS_BUF_SET_VTYPE(bp, type)
-#define XFS_BUF_SET_REF(bp, ref)
-
-#define xfs_buf_read(target, blkno, len, flags) \
- pagebuf_get((target), (blkno), (len), \
- PBF_LOCK | PBF_READ | PBF_MAPPED | PBF_MAPPABLE)
-#define xfs_buf_get(target, blkno, len, flags) \
- pagebuf_get((target), (blkno), (len), \
- PBF_LOCK | PBF_MAPPED | PBF_MAPPABLE)
-
-#define xfs_buf_read_flags(target, blkno, len, flags) \
- pagebuf_get((target), (blkno), (len), \
- PBF_READ | PBF_MAPPABLE | flags)
-#define xfs_buf_get_flags(target, blkno, len, flags) \
- pagebuf_get((target), (blkno), (len), \
- PBF_MAPPABLE | flags)
-
-static inline int xfs_bawrite(void *mp, page_buf_t *bp)
-{
- bp->pb_fspriv3 = mp;
- bp->pb_strat = xfs_bdstrat_cb;
- xfs_buf_undelay(bp);
- return pagebuf_iostart(bp, PBF_WRITE | PBF_ASYNC | PBF_RUN_QUEUES);
-}
-
-static inline void xfs_buf_relse(page_buf_t *bp)
-{
- if ((bp->pb_flags & _PBF_LOCKABLE) && !bp->pb_relse)
- pagebuf_unlock(bp);
- pagebuf_rele(bp);
-}
-
-#define xfs_bpin(bp) pagebuf_pin(bp)
-#define xfs_bunpin(bp) pagebuf_unpin(bp)
-
-#define xfs_buftrace(id, bp) \
- pagebuf_trace(bp, id, NULL, (void *)__builtin_return_address(0))
-
-#define xfs_biodone(pb) \
- pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), 0)
-
-#define xfs_incore(buftarg,blkno,len,lockit) \
- pagebuf_find(buftarg, blkno ,len, lockit)
-
-
-#define xfs_biomove(pb, off, len, data, rw) \
- pagebuf_iomove((pb), (off), (len), (data), \
- ((rw) == XFS_B_WRITE) ? PBRW_WRITE : PBRW_READ)
-
-#define xfs_biozero(pb, off, len) \
- pagebuf_iomove((pb), (off), (len), NULL, PBRW_ZERO)
-
-
-static inline int XFS_bwrite(page_buf_t *pb)
-{
- int iowait = (pb->pb_flags & PBF_ASYNC) == 0;
- int error = 0;
-
- pb->pb_flags |= PBF_SYNC;
- if (!iowait)
- pb->pb_flags |= PBF_RUN_QUEUES;
-
- xfs_buf_undelay(pb);
- pagebuf_iostrategy(pb);
- if (iowait) {
- error = pagebuf_iowait(pb);
- xfs_buf_relse(pb);
- }
- return error;
-}
-
-#define XFS_bdwrite(pb) \
- pagebuf_iostart(pb, PBF_DELWRI | PBF_ASYNC)
-
-static inline int xfs_bdwrite(void *mp, page_buf_t *bp)
-{
- bp->pb_strat = xfs_bdstrat_cb;
- bp->pb_fspriv3 = mp;
-
- return pagebuf_iostart(bp, PBF_DELWRI | PBF_ASYNC);
-}
-
-#define XFS_bdstrat(bp) pagebuf_iorequest(bp)
-
-#define xfs_iowait(pb) pagebuf_iowait(pb)
-
-
-/*
- * Go through all incore buffers, and release buffers
- * if they belong to the given device. This is used in
- * filesystem error handling to preserve the consistency
- * of its metadata.
- */
-
-#define xfs_binval(buftarg) xfs_flush_buftarg(buftarg)
-
-#define XFS_bflush(buftarg) xfs_flush_buftarg(buftarg)
-
-#define xfs_incore_relse(buftarg,delwri_only,wait) \
- xfs_relse_buftarg(buftarg)
-
-#define xfs_baread(target, rablkno, ralen) \
- pagebuf_readahead((target), (rablkno), (ralen), PBF_DONT_BLOCK)
-
-#define xfs_buf_get_empty(len, target) pagebuf_get_empty((len), (target))
-#define xfs_buf_get_noaddr(len, target) pagebuf_get_no_daddr((len), (target))
-#define xfs_buf_free(bp) pagebuf_free(bp)
-
-#endif /* __XFS_BUF_H__ */
--- /dev/null
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like. Any license provided herein, whether implied or
+ * otherwise, applies only to this software file. Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA 94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include "xfs.h"
+
+#include "xfs_fs.h"
+#include "xfs_inum.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir.h"
+#include "xfs_dir2.h"
+#include "xfs_alloc.h"
+#include "xfs_dmapi.h"
+#include "xfs_quota.h"
+#include "xfs_mount.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_attr_sf.h"
+#include "xfs_dir_sf.h"
+#include "xfs_dir2_sf.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_bit.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_itable.h"
+#include "xfs_rw.h"
+#include "xfs_acl.h"
+#include "xfs_cap.h"
+#include "xfs_mac.h"
+#include "xfs_attr.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_space.h"
+#include "xfs_utils.h"
+#include "xfs_iomap.h"
+
+#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
+ << mp->m_writeio_log)
+#define XFS_STRAT_WRITE_IMAPS 2
+#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
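+
+/*
+ * Editorial note: XFS_WRITEIO_ALIGN() rounds a byte offset down to the
+ * filesystem's write iosize.  For example, with m_writeio_log == 16
+ * (a 64k write iosize, an illustrative value), an offset of 0x12345
+ * aligns down to 0x10000.
+ */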
+
+STATIC int
+xfs_imap_to_bmap(
+ xfs_iocore_t *io,
+ xfs_off_t offset,
+ xfs_bmbt_irec_t *imap,
+ xfs_iomap_t *iomapp,
+ int imaps, /* Number of imap entries */
+ int iomaps, /* Number of iomap entries */
+ int flags)
+{
+ xfs_mount_t *mp;
+ xfs_fsize_t nisize;
+ int pbm;
+ xfs_fsblock_t start_block;
+
+ mp = io->io_mount;
+ nisize = XFS_SIZE(mp, io);
+ if (io->io_new_size > nisize)
+ nisize = io->io_new_size;
+
+ for (pbm = 0; imaps && pbm < iomaps; imaps--, iomapp++, imap++, pbm++) {
+ iomapp->iomap_target = io->io_flags & XFS_IOCORE_RT ?
+ mp->m_rtdev_targp : mp->m_ddev_targp;
+ iomapp->iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff);
+ iomapp->iomap_delta = offset - iomapp->iomap_offset;
+ iomapp->iomap_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount);
+ iomapp->iomap_flags = flags;
+
+ start_block = imap->br_startblock;
+ if (start_block == HOLESTARTBLOCK) {
+ iomapp->iomap_bn = IOMAP_DADDR_NULL;
+ iomapp->iomap_flags = IOMAP_HOLE;
+ } else if (start_block == DELAYSTARTBLOCK) {
+ iomapp->iomap_bn = IOMAP_DADDR_NULL;
+ iomapp->iomap_flags = IOMAP_DELAY;
+ } else {
+ iomapp->iomap_bn = XFS_FSB_TO_DB_IO(io, start_block);
+ if (ISUNWRITTEN(imap))
+ iomapp->iomap_flags |= IOMAP_UNWRITTEN;
+ }
+
+ if ((iomapp->iomap_offset + iomapp->iomap_bsize) >= nisize) {
+ iomapp->iomap_flags |= IOMAP_EOF;
+ }
+
+ offset += iomapp->iomap_bsize - iomapp->iomap_delta;
+ }
+ return pbm; /* Return the number filled */
+}
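+
+/*
+ * Editorial example of the translation above: with 4k filesystem
+ * blocks, an imap of { br_startoff = 2, br_blockcount = 8 } becomes
+ * an iomap with iomap_offset = 8192 and iomap_bsize = 32768, while
+ * iomap_delta records how far the caller's byte offset sits into
+ * that mapping.  (Values are illustrative only.)
+ */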
+
+int
+xfs_iomap(
+ xfs_iocore_t *io,
+ xfs_off_t offset,
+ ssize_t count,
+ int flags,
+ xfs_iomap_t *iomapp,
+ int *niomaps)
+{
+ xfs_mount_t *mp = io->io_mount;
+ xfs_fileoff_t offset_fsb, end_fsb;
+ int error = 0;
+ int lockmode = 0;
+ xfs_bmbt_irec_t imap;
+ int nimaps = 1;
+ int bmapi_flags = 0;
+ int iomap_flags = 0;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ switch (flags &
+ (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE |
+ BMAPI_UNWRITTEN | BMAPI_DEVICE)) {
+ case BMAPI_READ:
+ lockmode = XFS_LCK_MAP_SHARED(mp, io);
+ bmapi_flags = XFS_BMAPI_ENTIRE;
+ if (flags & BMAPI_IGNSTATE)
+ bmapi_flags |= XFS_BMAPI_IGSTATE;
+ break;
+ case BMAPI_WRITE:
+ lockmode = XFS_ILOCK_EXCL|XFS_EXTSIZE_WR;
+ bmapi_flags = 0;
+ XFS_ILOCK(mp, io, lockmode);
+ break;
+ case BMAPI_ALLOCATE:
+ lockmode = XFS_ILOCK_SHARED|XFS_EXTSIZE_RD;
+ bmapi_flags = XFS_BMAPI_ENTIRE;
+ /* Attempt non-blocking lock */
+ if (flags & BMAPI_TRYLOCK) {
+ if (!XFS_ILOCK_NOWAIT(mp, io, lockmode))
+ return XFS_ERROR(EAGAIN);
+ } else {
+ XFS_ILOCK(mp, io, lockmode);
+ }
+ break;
+ case BMAPI_UNWRITTEN:
+ goto phase2;
+ case BMAPI_DEVICE:
+ lockmode = XFS_LCK_MAP_SHARED(mp, io);
+ iomapp->iomap_target = io->io_flags & XFS_IOCORE_RT ?
+ mp->m_rtdev_targp : mp->m_ddev_targp;
+ error = 0;
+ *niomaps = 1;
+ goto out;
+ default:
+ BUG();
+ }
+
+ ASSERT(offset <= mp->m_maxioffset);
+ if ((xfs_fsize_t)offset + count > mp->m_maxioffset)
+ count = mp->m_maxioffset - offset;
+ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+
+ error = XFS_BMAPI(mp, NULL, io, offset_fsb,
+ (xfs_filblks_t)(end_fsb - offset_fsb),
+ bmapi_flags, NULL, 0, &imap,
+ &nimaps, NULL);
+
+ if (error)
+ goto out;
+
+phase2:
+ switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE|BMAPI_UNWRITTEN)) {
+ case BMAPI_WRITE:
+ /* If we found an extent, return it */
+ if (nimaps && (imap.br_startblock != HOLESTARTBLOCK))
+ break;
+
+ if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) {
+ error = XFS_IOMAP_WRITE_DIRECT(mp, io, offset,
+ count, flags, &imap, &nimaps, nimaps);
+ } else {
+ error = XFS_IOMAP_WRITE_DELAY(mp, io, offset, count,
+ flags, &imap, &nimaps);
+ }
+ iomap_flags = IOMAP_NEW;
+ break;
+ case BMAPI_ALLOCATE:
+ /* If we found an extent, return it */
+ XFS_IUNLOCK(mp, io, lockmode);
+ lockmode = 0;
+
+ if (nimaps && !ISNULLSTARTBLOCK(imap.br_startblock))
+ break;
+
+ error = XFS_IOMAP_WRITE_ALLOCATE(mp, io, &imap, &nimaps);
+ break;
+ case BMAPI_UNWRITTEN:
+ lockmode = 0;
+ error = XFS_IOMAP_WRITE_UNWRITTEN(mp, io, offset, count);
+ nimaps = 0;
+ break;
+ }
+
+ if (nimaps) {
+ *niomaps = xfs_imap_to_bmap(io, offset, &imap,
+ iomapp, nimaps, *niomaps, iomap_flags);
+ } else if (niomaps) {
+ *niomaps = 0;
+ }
+
+out:
+ if (lockmode)
+ XFS_IUNLOCK(mp, io, lockmode);
+ return XFS_ERROR(error);
+}
+
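+/*
+ * Editorial summary: xfs_flush_space() implements a staged response to
+ * running out of space during delayed allocation.  First flush the
+ * inode's own delayed blocks, then force synchronous allocation
+ * (BMAPI_SYNC), then flush the whole device; only after all stages
+ * fail does it return nonzero so the caller gives up with ENOSPC.
+ */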
+STATIC int
+xfs_flush_space(
+ xfs_inode_t *ip,
+ int *fsynced,
+ int *ioflags)
+{
+ switch (*fsynced) {
+ case 0:
+ if (ip->i_delayed_blks) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_flush_inode(ip);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ *fsynced = 1;
+ } else {
+ *ioflags |= BMAPI_SYNC;
+ *fsynced = 2;
+ }
+ return 0;
+ case 1:
+ *fsynced = 2;
+ *ioflags |= BMAPI_SYNC;
+ return 0;
+ case 2:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_flush_device(ip);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ *fsynced = 3;
+ return 0;
+ }
+ return 1;
+}
+
+int
+xfs_iomap_write_direct(
+ xfs_inode_t *ip,
+ loff_t offset,
+ size_t count,
+ int flags,
+ xfs_bmbt_irec_t *ret_imap,
+ int *nmaps,
+ int found)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ xfs_iocore_t *io = &ip->i_iocore;
+ xfs_fileoff_t offset_fsb;
+ xfs_fileoff_t last_fsb;
+ xfs_filblks_t count_fsb;
+ xfs_fsize_t isize;
+ xfs_fsblock_t firstfsb;
+ int nimaps, maps;
+ int error;
+ int bmapi_flag;
+ int rt;
+ xfs_trans_t *tp;
+ xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS], *imapp;
+ xfs_bmap_free_t free_list;
+ int aeof;
+ xfs_filblks_t datablocks;
+ int committed;
+ int numrtextents;
+ uint resblks;
+
+ /*
+ * Make sure that the dquots are there. This doesn't hold
+ * the ilock across a disk read.
+ */
+
+ error = XFS_QM_DQATTACH(ip->i_mount, ip, XFS_QMOPT_ILOCKED);
+ if (error)
+ return XFS_ERROR(error);
+
+ maps = min(XFS_WRITE_IMAPS, *nmaps);
+ nimaps = maps;
+
+ isize = ip->i_d.di_size;
+ aeof = (offset + count) > isize;
+
+ if (io->io_new_size > isize)
+ isize = io->io_new_size;
+
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
+ count_fsb = last_fsb - offset_fsb;
+ if (found && (ret_imap->br_startblock == HOLESTARTBLOCK)) {
+ xfs_fileoff_t map_last_fsb;
+
+ map_last_fsb = ret_imap->br_blockcount + ret_imap->br_startoff;
+
+ if (map_last_fsb < last_fsb) {
+ last_fsb = map_last_fsb;
+ count_fsb = last_fsb - offset_fsb;
+ }
+ ASSERT(count_fsb > 0);
+ }
+
+ /*
+ * Determine whether we are reserving space on
+ * the data or realtime partition.
+ */
+ if ((rt = XFS_IS_REALTIME_INODE(ip))) {
+ int sbrtextsize, iprtextsize;
+
+ sbrtextsize = mp->m_sb.sb_rextsize;
+ iprtextsize =
+ ip->i_d.di_extsize ? ip->i_d.di_extsize : sbrtextsize;
+ numrtextents = (count_fsb + iprtextsize - 1);
+ do_div(numrtextents, sbrtextsize);
+ datablocks = 0;
+ } else {
+ datablocks = count_fsb;
+ numrtextents = 0;
+ }
+
+ /*
+ * Allocate and set up the transaction.
+ */
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+
+ resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks);
+
+ error = xfs_trans_reserve(tp, resblks,
+ XFS_WRITE_LOG_RES(mp), numrtextents,
+ XFS_TRANS_PERM_LOG_RES,
+ XFS_WRITE_LOG_COUNT);
+
+ /*
+ * Check for running out of space: if the reservation failed,
+ * free the transaction structure.
+ */
+ if (error)
+ xfs_trans_cancel(tp, 0);
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ if (error)
+ goto error_out; /* couldn't return above without retaking
+ the ilock, which callers expect held */
+
+ if (XFS_TRANS_RESERVE_BLKQUOTA(mp, tp, ip, resblks)) {
+ error = (EDQUOT);
+ goto error1;
+ }
+ nimaps = 1;
+
+ bmapi_flag = XFS_BMAPI_WRITE;
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_ihold(tp, ip);
+
+ if (!(flags & BMAPI_MMAP) && (offset < ip->i_d.di_size || rt))
+ bmapi_flag |= XFS_BMAPI_PREALLOC;
+
+ /*
+ * issue the bmapi() call to allocate the blocks
+ */
+ XFS_BMAP_INIT(&free_list, &firstfsb);
+ imapp = &imap[0];
+ error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
+ bmapi_flag, &firstfsb, 0, imapp, &nimaps, &free_list);
+ if (error) {
+ goto error0;
+ }
+
+ /*
+ * complete the transaction
+ */
+
+ error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
+ if (error) {
+ goto error0;
+ }
+
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+ if (error) {
+ goto error_out;
+ }
+
+ /* copy any maps to caller's array and return any error. */
+ if (nimaps == 0) {
+ error = (ENOSPC);
+ goto error_out;
+ }
+
+ *ret_imap = imap[0];
+ *nmaps = 1;
+ return 0;
+
+ error0: /* Cancel bmap, unlock inode, and cancel trans */
+ xfs_bmap_cancel(&free_list);
+
+ error1: /* Just cancel transaction */
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ *nmaps = 0; /* nothing set-up here */
+
+error_out:
+ return XFS_ERROR(error);
+}
+
+int
+xfs_iomap_write_delay(
+ xfs_inode_t *ip,
+ loff_t offset,
+ size_t count,
+ int ioflag,
+ xfs_bmbt_irec_t *ret_imap,
+ int *nmaps)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ xfs_iocore_t *io = &ip->i_iocore;
+ xfs_fileoff_t offset_fsb;
+ xfs_fileoff_t last_fsb;
+ xfs_fsize_t isize;
+ xfs_fsblock_t firstblock;
+ int nimaps;
+ int error;
+ xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
+ int aeof;
+ int fsynced = 0;
+
+ ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0);
+
+ /*
+ * Make sure that the dquots are there. This doesn't hold
+ * the ilock across a disk read.
+ */
+
+ error = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED);
+ if (error)
+ return XFS_ERROR(error);
+
+retry:
+ isize = ip->i_d.di_size;
+ if (io->io_new_size > isize) {
+ isize = io->io_new_size;
+ }
+
+ aeof = 0;
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
+ /*
+ * If the caller is doing a write at the end of the file,
+ * then extend the allocation (and the buffer used for the write)
+ * out to the file system's write iosize. We clean up any extra
+ * space left over when the file is closed in xfs_inactive().
+ *
+ * We don't bother with this for sync writes, because we need
+ * to minimize the amount we write for good performance.
+ */
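+ /*
+ * (Editorial example: with a 64k write iosize, illustratively
+ * m_writeio_log == 16 and m_writeio_blocks == 16 on 4k blocks,
+ * a write ending near byte 70000 aligns down to 65536 and the
+ * allocation is extended out to last_fsb covering 128k.)
+ */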
+ if (!(ioflag & BMAPI_SYNC) && ((offset + count) > ip->i_d.di_size)) {
+ xfs_off_t aligned_offset;
+ unsigned int iosize;
+ xfs_fileoff_t ioalign;
+
+ iosize = mp->m_writeio_blocks;
+ aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
+ ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
+ last_fsb = ioalign + iosize;
+ aeof = 1;
+ }
+
+ nimaps = XFS_WRITE_IMAPS;
+ firstblock = NULLFSBLOCK;
+
+ /*
+ * Round up the allocation request to an m_dalign boundary if the
+ * file size is at least m_dalign and we are allocating past the
+ * allocation eof.
+ */
+ if (mp->m_dalign && (isize >= mp->m_dalign) && aeof) {
+ int eof;
+ xfs_fileoff_t new_last_fsb;
+ new_last_fsb = roundup_64(last_fsb, mp->m_dalign);
+ error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
+ if (error) {
+ return error;
+ }
+ if (eof) {
+ last_fsb = new_last_fsb;
+ }
+ }
+
+ error = xfs_bmapi(NULL, ip, offset_fsb,
+ (xfs_filblks_t)(last_fsb - offset_fsb),
+ XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
+ XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
+ &nimaps, NULL);
+ /*
+ * This can be EDQUOT, if nimaps == 0
+ */
+ if (error && (error != ENOSPC)) {
+ return XFS_ERROR(error);
+ }
+ /*
+ * If bmapi returned us nothing, and if we didn't get back EDQUOT,
+ * then we must have run out of space.
+ */
+
+ if (nimaps == 0) {
+ if (xfs_flush_space(ip, &fsynced, &ioflag))
+ return XFS_ERROR(ENOSPC);
+
+ error = 0;
+ goto retry;
+ }
+
+ *ret_imap = imap[0];
+ *nmaps = 1;
+ return 0;
+}
+
+/*
+ * Pass in a delayed allocate extent and convert it to real extents;
+ * return to the caller the extent we create that maps on top of
+ * the originating caller's request.
+ *
+ * Called without a lock on the inode.
+ */
+int
+xfs_iomap_write_allocate(
+ xfs_inode_t *ip,
+ xfs_bmbt_irec_t *map,
+ int *retmap)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb, last_block;
+ xfs_fileoff_t end_fsb, map_start_fsb;
+ xfs_fsblock_t first_block;
+ xfs_bmap_free_t free_list;
+ xfs_filblks_t count_fsb;
+ xfs_bmbt_irec_t imap[XFS_STRAT_WRITE_IMAPS];
+ xfs_trans_t *tp;
+ int i, nimaps, committed;
+ int error = 0;
+ int nres;
+
+ *retmap = 0;
+
+ /*
+ * Make sure that the dquots are there.
+ */
+
+ if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
+ return XFS_ERROR(error);
+
+ offset_fsb = map->br_startoff;
+ count_fsb = map->br_blockcount;
+ map_start_fsb = offset_fsb;
+
+ XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
+
+ while (count_fsb != 0) {
+ /*
+ * Set up a transaction with which to allocate the
+ * backing store for the file. Do allocations in a
+ * loop until we get some space in the range we are
+ * interested in. The other space that might be allocated
+ * is in the delayed allocation extent on which we sit
+ * but before our buffer starts.
+ */
+
+ nimaps = 0;
+ while (nimaps == 0) {
+ tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
+ nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+ error = xfs_trans_reserve(tp, nres,
+ XFS_WRITE_LOG_RES(mp),
+ 0, XFS_TRANS_PERM_LOG_RES,
+ XFS_WRITE_LOG_COUNT);
+
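+ /*
+ * Editorial note: on ENOSPC, retry with a zero block
+ * reservation, presumably because space for the delayed
+ * extent was already reserved when it was created; only
+ * log space is needed to convert it here.
+ */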
+ if (error == ENOSPC) {
+ error = xfs_trans_reserve(tp, 0,
+ XFS_WRITE_LOG_RES(mp),
+ 0,
+ XFS_TRANS_PERM_LOG_RES,
+ XFS_WRITE_LOG_COUNT);
+ }
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return XFS_ERROR(error);
+ }
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_ihold(tp, ip);
+
+ XFS_BMAP_INIT(&free_list, &first_block);
+
+ nimaps = XFS_STRAT_WRITE_IMAPS;
+ /*
+ * Ensure we don't go beyond eof - it is possible
+ * the extents changed since we did the read call,
+ * as we dropped the ilock in the interim.
+ */
+
+ end_fsb = XFS_B_TO_FSB(mp, ip->i_d.di_size);
+ xfs_bmap_last_offset(NULL, ip, &last_block,
+ XFS_DATA_FORK);
+ last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
+ if ((map_start_fsb + count_fsb) > last_block) {
+ count_fsb = last_block - map_start_fsb;
+ if (count_fsb == 0) {
+ error = EAGAIN;
+ goto trans_cancel;
+ }
+ }
+
+ /* Go get the actual blocks */
+ error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb,
+ XFS_BMAPI_WRITE, &first_block, 1,
+ imap, &nimaps, &free_list);
+
+ if (error)
+ goto trans_cancel;
+
+ error = xfs_bmap_finish(&tp, &free_list,
+ first_block, &committed);
+
+ if (error)
+ goto trans_cancel;
+
+ error = xfs_trans_commit(tp,
+ XFS_TRANS_RELEASE_LOG_RES, NULL);
+
+ if (error)
+ goto error0;
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ }
+
+ /*
+ * See if we were able to allocate an extent that
+ * covers at least part of the caller's request.
+ */
+
+ for (i = 0; i < nimaps; i++) {
+ if ((map->br_startoff >= imap[i].br_startoff) &&
+ (map->br_startoff < (imap[i].br_startoff +
+ imap[i].br_blockcount))) {
+ *map = imap[i];
+ *retmap = 1;
+ XFS_STATS_INC(xs_xstrat_quick);
+ return 0;
+ }
+ count_fsb -= imap[i].br_blockcount;
+ }
+
+ /* So far we have not mapped the requested part of the
+ * file, just surrounding data; try again.
+ */
+ nimaps--;
+ offset_fsb = imap[nimaps].br_startoff +
+ imap[nimaps].br_blockcount;
+ map_start_fsb = offset_fsb;
+ }
+
+trans_cancel:
+ xfs_bmap_cancel(&free_list);
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+error0:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return XFS_ERROR(error);
+}
+
+int
+xfs_iomap_write_unwritten(
+ xfs_inode_t *ip,
+ loff_t offset,
+ size_t count)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ xfs_trans_t *tp;
+ xfs_fileoff_t offset_fsb;
+ xfs_filblks_t count_fsb;
+ xfs_filblks_t numblks_fsb;
+ xfs_bmbt_irec_t imap;
+ int committed;
+ int error;
+ int nres;
+ int nimaps;
+ xfs_fsblock_t firstfsb;
+ xfs_bmap_free_t free_list;
+
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ count_fsb = XFS_B_TO_FSB(mp, count);
+
+ do {
+ nres = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+
+ /*
+ * Set up a transaction to convert the range of extents
+ * from unwritten to real. Do allocations in a loop until
+ * we have covered the range passed in.
+ */
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
+ error = xfs_trans_reserve(tp, nres,
+ XFS_WRITE_LOG_RES(mp), 0,
+ XFS_TRANS_PERM_LOG_RES,
+ XFS_WRITE_LOG_COUNT);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ goto error0;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_ihold(tp, ip);
+
+ /*
+ * Modify the unwritten extent state of the buffer.
+ */
+ XFS_BMAP_INIT(&free_list, &firstfsb);
+ nimaps = 1;
+ error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
+ XFS_BMAPI_WRITE, &firstfsb,
+ 1, &imap, &nimaps, &free_list);
+ if (error)
+ goto error_on_bmapi_transaction;
+
+ error = xfs_bmap_finish(&(tp), &(free_list),
+ firstfsb, &committed);
+ if (error)
+ goto error_on_bmapi_transaction;
+
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
+ goto error0;
+
+ if ((numblks_fsb = imap.br_blockcount) == 0) {
+ /*
+ * The numblks_fsb value should always get
+ * smaller; otherwise the loop is stuck.
+ */
+ ASSERT(imap.br_blockcount);
+ break;
+ }
+ offset_fsb += numblks_fsb;
+ count_fsb -= numblks_fsb;
+ } while (count_fsb > 0);
+
+ return 0;
+
+error_on_bmapi_transaction:
+ xfs_bmap_cancel(&free_list);
+ xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+error0:
+ return XFS_ERROR(error);
+}