git.neil.brown.name Git - LaFS.git/commitdiff
Avoid races when processing blocks for checkpoint.
authorNeilBrown <neilb@suse.de>
Sat, 15 Aug 2009 07:03:19 +0000 (17:03 +1000)
committerNeilBrown <neilb@suse.de>
Sat, 15 Aug 2009 07:03:19 +0000 (17:03 +1000)
When doing a checkpoint we need to be sure that every
flushable block is accounted for.  So we need to be able
to wait for anything outstanding.

Currently a block can be removed from the leaflist by writepage
before it is placed on the cluster list.  During this window
the checkpoint thread cannot see it, and so might progress without
waiting for it, wrongly concluding that the checkpoint has completed
prematurely.

So delay the removal from the leaflist until after we have the
writecluster lock.  This ensures that every leaf block will be
either on the leaf list or on the cluster list when checkpoint is
looking for it.

When checkpoint calls cluster_done, this will release any blocks
from the cluster and, if needed, put them back on the leaf list where
they can be found again.

cluster.c

index 35c4db6928397d06ad534a79e4e1ada33407357c..ea52e2e707da59fa0c33b624e4ee9cf4673795eb 100644 (file)
--- a/cluster.c
+++ b/cluster.c
@@ -518,24 +518,6 @@ unsigned long long lafs_cluster_allocate(struct block *b, int cnum)
        BUG_ON(!test_bit(B_IOLock, &b->flags));
        lai = LAFSI(b->inode);
 
-       if (!list_empty_careful(&b->lru)) {
-               /* Someone is flushing this block before
-                * checkpoint got to it - probably writepage.
-                * Must remove it from the leaf lru
-                */
-               int onlru = 1;
-               BUG_ON(test_bit(B_OnFree, &b->flags));
-               spin_lock(&fs->lock);
-               if (list_empty(&b->lru))
-                       onlru = 0;
-               else
-                       list_del_init(&b->lru);
-               spin_unlock(&fs->lock);
-               if (onlru)
-                       putref(b, MKREF(leaf));
-       }
-       BUG_ON(!list_empty(&b->lru));
-
        if (!test_bit(B_Valid, &b->flags)) {
                lafs_iounlock_block(b, B_IOLock);
                return wc->cluster_seq;
@@ -689,6 +671,30 @@ unsigned long long lafs_cluster_allocate(struct block *b, int cnum)
        BUG_ON(!test_bit(B_Valid, &b->flags));
 
        mutex_lock(&wc->lock);
+
+       /* The following check must be inside the mutex_lock
+        * to ensure exclusion between checkpoint and writepage.
+        * checkpoint must never see the block not on the
+        * leaf lru unless it is already in the cluster and so can
+        * be waited for.
+        */
+       if (!list_empty_careful(&b->lru)) {
+               /* Someone is flushing this block before
+                * checkpoint got to it - probably writepage.
+                * Must remove it from the leaf lru
+                */
+               int onlru = 1;
+               BUG_ON(test_bit(B_OnFree, &b->flags));
+               spin_lock(&fs->lock);
+               if (list_empty(&b->lru))
+                       onlru = 0;
+               else
+                       list_del_init(&b->lru);
+               spin_unlock(&fs->lock);
+               if (onlru)
+                       putref(b, MKREF(leaf));
+       }
+
        while (wc->remaining < 1) {
                // printk("Call flush - remaining = %d\n", wc->remaining);
                if (wc->slhead.b == NULL)
@@ -1206,7 +1212,6 @@ static void cluster_flush(struct fs *fs, int cnum)
        while (!list_empty(&wc->clhead)) {
                int credits = 0;
                b = list_entry(wc->clhead.next, struct block, lru);
-               list_del_init(&b->lru);
                if (test_bit(B_Phase1, &b->flags))
                        set_bit(B_WritePhase1, &b->flags);
                else
@@ -1238,7 +1243,7 @@ static void cluster_flush(struct fs *fs, int cnum)
                        }
                }
                spin_lock(&fs->lock);
-               BUG_ON(!list_empty(&b->lru));
+               list_del(&b->lru);
                list_add(&b->lru, &wc->pending_blocks[(wc->pending_next+3)%4]);
                spin_unlock(&fs->lock);
                lafs_write_block(fs, b, segend.dev, wc);