git.neil.brown.name Git - history.git/commitdiff
Import 2.4.0-test6pre5 2.4.0-test6pre5
author Linus Torvalds <torvalds@linuxfoundation.org>
Fri, 23 Nov 2007 20:37:30 +0000 (15:37 -0500)
committer Linus Torvalds <torvalds@linuxfoundation.org>
Fri, 23 Nov 2007 20:37:30 +0000 (15:37 -0500)
fs/ext2/inode.c

index 5dfe2cf55edd1a61c4678ff97f69ccba7bc39e28..4f0b78da37e7ea298cfadb0e18890cb43071f81e 100644 (file)
@@ -18,6 +18,8 @@
  *        David S. Miller (davem@caip.rutgers.edu), 1995
  *  64-bit file support on 64-bit platforms by Jakub Jelinek
  *     (jj@sunsite.ms.mff.cuni.cz)
+ *
+ *  Assorted race fixes, rewrite of ext2_get_block() by Al Viro, 2000
  */
 
 #include <linux/fs.h>
@@ -26,8 +28,6 @@
 #include <linux/sched.h>
 #include <linux/highuid.h>
 
-
-
 static int ext2_update_inode(struct inode * inode, int do_sync);
 
 /*
@@ -64,23 +64,18 @@ no_delete:
        clear_inode(inode);     /* We must guarantee clearing of inode... */
 }
 
-/* 
- * ext2_discard_prealloc and ext2_alloc_block are atomic wrt. the
- * superblock in the same manner as are ext2_free_blocks and
- * ext2_new_block.  We just wait on the super rather than locking it
- * here, since ext2_new_block will do the necessary locking and we
- * can't block until then.
- */
 void ext2_discard_prealloc (struct inode * inode)
 {
 #ifdef EXT2_PREALLOCATE
-       unsigned short total;
-
        lock_kernel();
+       /* Writer: ->i_prealloc* */
        if (inode->u.ext2_i.i_prealloc_count) {
-               total = inode->u.ext2_i.i_prealloc_count;
+               unsigned short total = inode->u.ext2_i.i_prealloc_count;
+               unsigned long block = inode->u.ext2_i.i_prealloc_block;
                inode->u.ext2_i.i_prealloc_count = 0;
-               ext2_free_blocks (inode, inode->u.ext2_i.i_prealloc_block, total);
+               inode->u.ext2_i.i_prealloc_block = 0;
+               /* Writer: end */
+               ext2_free_blocks (inode, block, total);
        }
        unlock_kernel();
 #endif
@@ -93,22 +88,26 @@ static int ext2_alloc_block (struct inode * inode, unsigned long goal, int *err)
 #endif
        unsigned long result;
 
-       wait_on_super (inode->i_sb);
 
 #ifdef EXT2_PREALLOCATE
+       /* Writer: ->i_prealloc* */
        if (inode->u.ext2_i.i_prealloc_count &&
            (goal == inode->u.ext2_i.i_prealloc_block ||
             goal + 1 == inode->u.ext2_i.i_prealloc_block))
        {               
                result = inode->u.ext2_i.i_prealloc_block++;
                inode->u.ext2_i.i_prealloc_count--;
+               /* Writer: end */
+#ifdef EXT2FS_DEBUG
                ext2_debug ("preallocation hit (%lu/%lu).\n",
                            ++alloc_hits, ++alloc_attempts);
-
+#endif
        } else {
                ext2_discard_prealloc (inode);
+#ifdef EXT2FS_DEBUG
                ext2_debug ("preallocation miss (%lu/%lu).\n",
                            alloc_hits, ++alloc_attempts);
+#endif
                if (S_ISREG(inode->i_mode))
                        result = ext2_new_block (inode, goal, 
                                 &inode->u.ext2_i.i_prealloc_count,
@@ -299,307 +298,309 @@ no_block:
        return p;
 }
 
-static struct buffer_head * inode_getblk (struct inode * inode, int nr,
-       int new_block, int * err, int metadata, long *phys, int *new)
+/**
+ *     ext2_find_near - find a place for allocation with sufficient locality
+ *     @inode: owner
+ *     @ind: descriptor of indirect block.
+ *
+ *     This function returns the preferred place for block allocation.
+ *     It is used when heuristic for sequential allocation fails.
+ *     Rules are:
+ *       + if there is a block to the left of our position - allocate near it.
+ *       + if pointer will live in indirect block - allocate near that block.
+ *       + if pointer will live in inode - allocate in the same cylinder group.
+ *     Caller must make sure that @ind is valid and will stay that way.
+ */
+
+static inline unsigned long ext2_find_near(struct inode *inode, Indirect *ind)
 {
-       u32 * p;
-       int tmp, goal = 0;
-       struct buffer_head * result;
-       int blocksize = inode->i_sb->s_blocksize;
+       u32 *start = ind->bh ? (u32*) ind->bh->b_data : inode->u.ext2_i.i_data;
+       u32 *p;
 
-       p = inode->u.ext2_i.i_data + nr;
-repeat:
-       tmp = le32_to_cpu(*p);
-       if (tmp) {
-               if (metadata) {
-                       result = getblk (inode->i_dev, tmp, blocksize);
-                       if (tmp == le32_to_cpu(*p))
-                               return result;
-                       brelse (result);
-                       goto repeat;
-               } else {
-                       *phys = tmp;
-                       return NULL;
-               }
-       }
+       /* Try to find previous block */
+       for (p = ind->p - 1; p >= start; p--)
+               if (*p)
+                       return le32_to_cpu(*p);
 
-       if (inode->u.ext2_i.i_next_alloc_block == new_block)
-               goal = inode->u.ext2_i.i_next_alloc_goal;
+       /* No such thing, so let's try location of indirect block */
+       if (ind->bh)
+               return ind->bh->b_blocknr;
 
-       ext2_debug ("hint = %d,", goal);
+       /*
+        * Is it going to be referred to from inode itself? OK, just put it into
+        * the same cylinder group then.
+        */
+       return (inode->u.ext2_i.i_block_group * 
+               EXT2_BLOCKS_PER_GROUP(inode->i_sb)) +
+              le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_first_data_block);
+}
 
-       if (!goal) {
-               for (tmp = nr - 1; tmp >= 0; tmp--) {
-                       if (inode->u.ext2_i.i_data[tmp]) {
-                               goal = le32_to_cpu(inode->u.ext2_i.i_data[tmp]);
-                               break;
-                       }
-               }
-               if (!goal)
-                       goal = (inode->u.ext2_i.i_block_group * 
-                               EXT2_BLOCKS_PER_GROUP(inode->i_sb)) +
-                              le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_first_data_block);
-       }
+/**
+ *     ext2_find_goal - find a preferred place for allocation.
+ *     @inode: owner
+ *     @block:  block we want
+ *     @chain:  chain of indirect blocks
+ *     @partial: pointer to the last triple within a chain
+ *     @goal:  place to store the result.
+ *
+ *     Normally this function finds the preferred place for block allocation,
+ *     stores it in *@goal and returns zero. If the branch had been changed
+ *     under us we return -EAGAIN.
+ */
 
-       ext2_debug ("goal = %d.\n", goal);
-
-       tmp = ext2_alloc_block (inode, goal, err);
-       if (!tmp)
-               return NULL;
-
-       if (metadata) {
-               result = getblk (inode->i_dev, tmp, blocksize);
-               if (!buffer_uptodate(result))
-                       wait_on_buffer(result);
-               memset(result->b_data, 0, blocksize);
-               mark_buffer_uptodate(result, 1);
-               mark_buffer_dirty(result, 1);
-               if (*p) {
-                       ext2_free_blocks (inode, tmp, 1);
-                       bforget (result);
-                       goto repeat;
-               }
-       } else {
-               if (*p) {
-                       /*
-                        * Nobody is allowed to change block allocation
-                        * state from under us:
-                        */
-                       ext2_error (inode->i_sb, "block_getblk",
-                                   "data block filled under us");
-                       BUG();
-                       ext2_free_blocks (inode, tmp, 1);
-                       goto repeat;
-               }
-               *phys = tmp;
-               result = NULL;
-               *err = 0;
-               *new = 1;
+static inline int ext2_find_goal(struct inode *inode,
+                                long block,
+                                Indirect chain[4],
+                                Indirect *partial,
+                                unsigned long *goal)
+{
+       /* Writer: ->i_next_alloc* */
+       if (block == inode->u.ext2_i.i_next_alloc_block + 1) {
+               inode->u.ext2_i.i_next_alloc_block++;
+               inode->u.ext2_i.i_next_alloc_goal++;
+       } 
+       /* Writer: end */
+       /* Reader: pointers, ->i_next_alloc* */
+       if (verify_chain(chain, partial)) {
+               /*
+                * try the heuristic for sequential allocation,
+                * failing that at least try to get decent locality.
+                */
+               if (block == inode->u.ext2_i.i_next_alloc_block)
+                       *goal = inode->u.ext2_i.i_next_alloc_goal;
+               if (!*goal)
+                       *goal = ext2_find_near(inode, partial);
+               return 0;
        }
-       *p = cpu_to_le32(tmp);
-
-       inode->u.ext2_i.i_next_alloc_block = new_block;
-       inode->u.ext2_i.i_next_alloc_goal = tmp;
-       inode->i_ctime = CURRENT_TIME;
-       inode->i_blocks += blocksize/512;
-       if (IS_SYNC(inode) || inode->u.ext2_i.i_osync)
-               ext2_sync_inode (inode);
-       else
-               mark_inode_dirty(inode);
-       return result;
+       /* Reader: end */
+       return -EAGAIN;
 }
 
-/*
- *   metadata / data
- *   possibly create / access
- *   can fail due to: - not present
- *                    - out of space
+/**
+ *     ext2_alloc_branch - allocate and set up a chain of blocks.
+ *     @inode: owner
+ *     @num: depth of the chain (number of blocks to allocate)
+ *     @offsets: offsets (in the blocks) to store the pointers to next.
+ *     @branch: place to store the chain in.
+ *
+ *     This function allocates @num blocks, zeroes out all but the last one,
+ *     links them into chain and (if we are synchronous) writes them to disk.
+ *     In other words, it prepares a branch that can be spliced onto the
+ *     inode. It stores the information about that chain in the branch[], in
+ *     the same format as ext2_get_branch() would do. We are calling it after
+ *     we had read the existing part of chain and partial points to the last
+ *     triple of that (one with zero ->key). Upon the exit we have the same
+ *     picture as after the successful ext2_get_block(), except that in one
+ *     place chain is disconnected - *branch->p is still zero (we did not
+ *     set the last link), but branch->key contains the number that should
+ *     be placed into *branch->p to fill that gap.
  *
- *   NULL return in the data case is mandatory.
+ *     If allocation fails we free all blocks we've allocated (and forget
+ *     their buffer_heads) and return the error value from the failed
+ *     ext2_alloc_block() (normally -ENOSPC). Otherwise we set the chain
+ *     as described above and return 0.
  */
-static struct buffer_head * block_getblk (struct inode * inode,
-         struct buffer_head * bh, int nr,
-         int new_block, int * err, int metadata, long *phys, int *new)
+
+static int ext2_alloc_branch(struct inode *inode,
+                            int num,
+                            unsigned long goal,
+                            int *offsets,
+                            Indirect *branch)
 {
-       int tmp, goal = 0;
-       u32 * p;
-       struct buffer_head * result;
        int blocksize = inode->i_sb->s_blocksize;
+       int n = 0;
+       int err;
+       int i;
+       int parent = ext2_alloc_block(inode, goal, &err);
 
-       result = NULL;  
-       if (!bh)
-               goto out;
-       if (!buffer_uptodate(bh)) {
-               ll_rw_block (READ, 1, &bh);
-               wait_on_buffer (bh);
+       branch[0].key = cpu_to_le32(parent);
+       if (parent) for (n = 1; n < num; n++) {
+               struct buffer_head *bh;
+               /* Allocate the next block */
+               int nr = ext2_alloc_block(inode, parent, &err);
+               if (!nr)
+                       break;
+               branch[n].key = cpu_to_le32(nr);
+               /*
+                * Get buffer_head for parent block, zero it out and set 
+                * the pointer to new one, then send parent to disk.
+                */
+               bh = getblk(inode->i_dev, parent, blocksize);
                if (!buffer_uptodate(bh))
-                       goto out;
-       }
-       p = (u32 *) bh->b_data + nr;
-repeat:
-       tmp = le32_to_cpu(*p);
-       if (tmp) {
-               if (metadata) {
-                       result = getblk (bh->b_dev, tmp, blocksize);
-                       if (tmp == le32_to_cpu(*p))
-                               goto out;
-                       brelse (result);
-                       goto repeat;
-               } else {
-                       *phys = tmp;
-                       /* result == NULL */
-                       goto out;
+                       wait_on_buffer(bh);
+               memset(bh->b_data, 0, blocksize);
+               branch[n].bh = bh;
+               branch[n].p = (u32*) bh->b_data + offsets[n];
+               *branch[n].p = branch[n].key;
+               mark_buffer_uptodate(bh, 1);
+               mark_buffer_dirty(bh, 1);
+               if (IS_SYNC(inode) || inode->u.ext2_i.i_osync) {
+                       ll_rw_block (WRITE, 1, &bh);
+                       wait_on_buffer (bh);
                }
+               parent = nr;
        }
+       if (n == num)
+               return 0;
 
-       if (inode->u.ext2_i.i_next_alloc_block == new_block)
-               goal = inode->u.ext2_i.i_next_alloc_goal;
-       if (!goal) {
-               for (tmp = nr - 1; tmp >= 0; tmp--) {
-                       if (le32_to_cpu(((u32 *) bh->b_data)[tmp])) {
-                               goal = le32_to_cpu(((u32 *)bh->b_data)[tmp]);
-                               break;
-                       }
-               }
-               if (!goal)
-                       goal = bh->b_blocknr;
-       }
-       tmp = ext2_alloc_block (inode, goal, err);
-       if (!tmp)
-               goto out;
-       if (metadata) {
-               result = getblk (bh->b_dev, tmp, blocksize);
-               if (!buffer_uptodate(result))
-                       wait_on_buffer(result);
-               memset(result->b_data, 0, inode->i_sb->s_blocksize);
-               mark_buffer_uptodate(result, 1);
-               mark_buffer_dirty(result, 1);
-               if (*p) {
-                       ext2_free_blocks (inode, tmp, 1);
-                       bforget (result);
-                       goto repeat;
-               }
-       } else {
-               if (*p) {
-                       /*
-                        * Nobody is allowed to change block allocation
-                        * state from under us:
-                        */
-                       ext2_error (inode->i_sb, "block_getblk",
-                                   "data block filled under us");
-                       BUG();
-                       ext2_free_blocks (inode, tmp, 1);
-                       goto repeat;
+       /* Allocation failed, free what we already allocated */
+       for (i = 1; i < n; i++)
+               bforget(branch[i].bh);
+       for (i = 0; i < n; i++)
+               ext2_free_blocks(inode, le32_to_cpu(branch[i].key), 1);
+       return err;
+}
+
+/**
+ *     ext2_splice_branch - splice the allocated branch onto inode.
+ *     @inode: owner
+ *     @block: (logical) number of block we are adding
+ *     @chain: chain of indirect blocks (with a missing link - see
+ *             ext2_alloc_branch)
+ *     @where: location of missing link
+ *     @num:   number of blocks we are adding
+ *
+ *     This function verifies that chain (up to the missing link) had not
+ *     changed, fills the missing link and does all housekeeping needed in
+ *     inode (->i_blocks, etc.). In case of success we end up with the full
+ *     chain to new block and return 0. Otherwise (== chain had been changed)
+ *     we free the new blocks (forgetting their buffer_heads, indeed) and
+ *     return -EAGAIN.
+ */
+
+static inline int ext2_splice_branch(struct inode *inode,
+                                    long block,
+                                    Indirect chain[4],
+                                    Indirect *where,
+                                    int num)
+{
+       int i;
+
+       /* Verify that place we are splicing to is still there and vacant */
+
+       /* Writer: pointers, ->i_next_alloc*, ->i_blocks */
+       if (!verify_chain(chain, where-1) || *where->p)
+               /* Writer: end */
+               goto changed;
+
+       /* That's it */
+
+       *where->p = where->key;
+       inode->u.ext2_i.i_next_alloc_block = block;
+       inode->u.ext2_i.i_next_alloc_goal = le32_to_cpu(where[num-1].key);
+       inode->i_blocks += num * inode->i_sb->s_blocksize/512;
+
+       /* Writer: end */
+
+       /* We are done with atomic stuff, now do the rest of housekeeping */
+
+       inode->i_ctime = CURRENT_TIME;
+
+       /* had we spliced it onto indirect block? */
+       if (where->bh) {
+               mark_buffer_dirty(where->bh, 1);
+               if (IS_SYNC(inode) || inode->u.ext2_i.i_osync) {
+                       ll_rw_block (WRITE, 1, &where->bh);
+                       wait_on_buffer(where->bh);
                }
-               *phys = tmp;
-               *new = 1;
-       }
-       *p = le32_to_cpu(tmp);
-       mark_buffer_dirty(bh, 1);
-       if (IS_SYNC(inode) || inode->u.ext2_i.i_osync) {
-               ll_rw_block (WRITE, 1, &bh);
-               wait_on_buffer (bh);
        }
-       inode->i_ctime = CURRENT_TIME;
-       inode->i_blocks += blocksize/512;
-       mark_inode_dirty(inode);
-       inode->u.ext2_i.i_next_alloc_block = new_block;
-       inode->u.ext2_i.i_next_alloc_goal = tmp;
-       *err = 0;
-out:
-       brelse (bh);
-       return result;
+
+       if (IS_SYNC(inode) || inode->u.ext2_i.i_osync)
+               ext2_sync_inode (inode);
+       else
+               mark_inode_dirty(inode);
+       return 0;
+
+changed:
+       for (i = 1; i < num; i++)
+               bforget(where[i].bh);
+       for (i = 0; i < num; i++)
+               ext2_free_blocks(inode, le32_to_cpu(where[i].key), 1);
+       return -EAGAIN;
 }
 
+/*
+ * Allocation strategy is simple: if we have to allocate something, we will
+ * have to go the whole way to leaf. So let's do it before attaching anything
+ * to tree, set linkage between the newborn blocks, write them if sync is
+ * required, recheck the path, free and repeat if check fails, otherwise
+ * set the last missing link (that will protect us from any truncate-generated
+ * removals - all blocks on the path are immune now) and possibly force the
+ * write on the parent block.
+ * That has a nice additional property: no special recovery from the failed
+ * allocations is needed - we simply release blocks and do not touch anything
+ * reachable from inode.
+ */
+
 static int ext2_get_block(struct inode *inode, long iblock, struct buffer_head *bh_result, int create)
 {
-       int ret, err, new;
-       struct buffer_head *bh;
-       unsigned long phys;
+       int err = -EIO;
        int offsets[4];
-       int *p;
        Indirect chain[4];
        Indirect *partial;
-       int depth;
+       unsigned long goal;
+       int left;
+       int depth = ext2_block_to_path(inode, iblock, offsets);
 
-       depth = ext2_block_to_path(inode, iblock, offsets);
        if (depth == 0)
-               goto abort;
+               goto out;
 
        lock_kernel();
+reread:
        partial = ext2_get_branch(inode, depth, offsets, chain, &err);
 
+       /* Simplest case - block found, no allocation needed */
        if (!partial) {
-               unlock_kernel();
-               for (partial = chain + depth - 1; partial > chain; partial--)
-                       brelse(partial->bh);
+got_it:
                bh_result->b_dev = inode->i_dev;
                bh_result->b_blocknr = le32_to_cpu(chain[depth-1].key);
                bh_result->b_state |= (1UL << BH_Mapped);
-               return 0;
+               /* Clean up and exit */
+               partial = chain+depth-1; /* the whole chain */
+               goto cleanup;
        }
 
-       while (partial > chain) {
-               brelse(partial->bh);
-               partial--;
-       }
-
-       if (!create) {
+       /* Next simple case - plain lookup or failed read of indirect block */
+       if (!create || err == -EIO) {
+cleanup:
+               while (partial > chain) {
+                       brelse(partial->bh);
+                       partial--;
+               }
                unlock_kernel();
-               return 0;
+out:
+               return err;
        }
 
-       err = -EIO;
-       new = 0;
-       ret = 0;
-       bh = NULL;
-
        /*
-        * If this is a sequential block allocation, set the next_alloc_block
-        * to this block now so that all the indblock and data block
-        * allocations use the same goal zone
+        * Indirect block might be removed by truncate while we were
+        * reading it. Handling of that case (forget what we've got and
+        * reread) is taken out of the main path.
         */
+       if (err == -EAGAIN)
+               goto changed;
 
-       ext2_debug ("block %lu, next %lu, goal %lu.\n", iblock, 
-                   inode->u.ext2_i.i_next_alloc_block,
-                   inode->u.ext2_i.i_next_alloc_goal);
+       if (ext2_find_goal(inode, iblock, chain, partial, &goal) < 0)
+               goto changed;
 
-       if (iblock == inode->u.ext2_i.i_next_alloc_block + 1) {
-               inode->u.ext2_i.i_next_alloc_block++;
-               inode->u.ext2_i.i_next_alloc_goal++;
-       }
+       left = (chain + depth) - partial;
+       err = ext2_alloc_branch(inode, left, goal,
+                                       offsets+(partial-chain), partial);
+       if (err)
+               goto cleanup;
 
-       err = 0;
+       if (ext2_splice_branch(inode, iblock, chain, partial, left) < 0)
+               goto changed;
 
-       /*
-        * ok, these macros clean the logic up a bit and make
-        * it much more readable:
-        */
-#define GET_INODE_DATABLOCK(x) \
-               inode_getblk(inode, x, iblock, &err, 0, &phys, &new)
-#define GET_INODE_PTR(x) \
-               inode_getblk(inode, x, iblock, &err, 1, NULL, NULL)
-#define GET_INDIRECT_DATABLOCK(x) \
-               block_getblk (inode, bh, x, iblock, &err, 0, &phys, &new);
-#define GET_INDIRECT_PTR(x) \
-               block_getblk (inode, bh, x, iblock, &err, 1, NULL, NULL);
-
-       p = offsets;
-       if (depth == 1) {
-               bh = GET_INODE_DATABLOCK(*p);
-               goto out;
-       }
-       bh = GET_INODE_PTR(*p);
-       switch (depth) {
-               default: /* case 4: */
-                       bh = GET_INDIRECT_PTR(*++p);
-               case 3:
-                       bh = GET_INDIRECT_PTR(*++p);
-               case 2:
-                       bh = GET_INDIRECT_DATABLOCK(*++p);
-       }
-
-#undef GET_INODE_DATABLOCK
-#undef GET_INODE_PTR
-#undef GET_INDIRECT_DATABLOCK
-#undef GET_INDIRECT_PTR
+       bh_result->b_state |= (1UL << BH_New);
+       goto got_it;
 
-out:
-       if (bh)
-               BUG();  // temporary debugging check
-       if (err)
-               goto abort;
-       if (!phys)
-               BUG();  // must not happen either
-
-       bh_result->b_dev = inode->i_dev;
-       bh_result->b_blocknr = phys;
-       bh_result->b_state |= (1UL << BH_Mapped); /* safe */
-       if (new)
-               bh_result->b_state |= (1UL << BH_New);
-       unlock_kernel();
-abort:
-       return err;
+changed:
+       while (partial > chain) {
+               bforget(partial->bh);
+               partial--;
+       }
+       goto reread;
 }
 
 struct buffer_head * ext2_getblk(struct inode * inode, long block, int create, int * err)