From: NeilBrown Date: Sat, 15 Aug 2009 07:09:22 +0000 (+1000) Subject: Simplify writeout rules for inode data block. X-Git-Url: http://git.neil.brown.name/?a=commitdiff_plain;h=b39702aed7b9428bd179a865214396ab37f7ee59;p=LaFS.git Simplify writeout rules for inode data block. Previously we did not write an inode data block until the InoIdx was ready. This is not good if we need to sync an inode well before checkpoint runs. So just write an inde data block when we find it, but ensure not to send an inode data block during checkpoint until the InoIdx block is ready. Note: it is now clear why an inode has two sets of credits, one on the data block and one on the index block. The first set may be needed to sync the inode metadata. The second may be need to update the indexing information - they are copied across to the data block for this purpose. --- diff --git a/checkpoint.c b/checkpoint.c index 686d2cc..e0d5e93 100644 --- a/checkpoint.c +++ b/checkpoint.c @@ -294,6 +294,8 @@ static int prepare_checkpoint(struct fs *fs) struct block *lafs_get_flushable(struct fs *fs, int phase) { struct block *b; + struct inode *in; + struct lafs_inode *lai; retry: spin_lock(&fs->lock); @@ -303,6 +305,31 @@ struct block *lafs_get_flushable(struct fs *fs, int phase) b = list_entry(fs->phase_leafs[phase].next, struct block, lru); else b = NULL; + + /* If this is an inode data block and the InoIdx block is + * also flushable (or will be soon), then de-pin this block. + * The InoIdx will hold and eventually pin it. + */ + if (b && !test_bit(B_Index, &b->flags) && + LAFSI(b->inode)->type == TypeInodeFile && + (in = dblk(b)->my_inode) && + (lai = LAFSI(in)) && + lai->iblock != NULL && + test_bit(B_Pinned, &lai->iblock->b.flags) && + (phase == !!test_bit(B_Phase1, &lai->iblock->b.flags) || + (phase == -1 && test_bit(B_Realloc, &lai->iblock->b.flags))) + ) { + /* Don't really want this to be pinned yet. When the + * InoIdx block gets proceed we will get pinned again. + */ + list_del_init(&b->lru); + clear_bit(B_Pinned, &b->flags); + if (!test_bit(B_Root, &b->flags)) + atomic_dec(&b->parent->pincnt[!!test_bit(B_Phase1, &b->flags)]); + spin_unlock(&fs->lock); + putref(b, MKREF(leaf)); + goto retry; + } if (b) /* the list counted a reference. Now we hold it */ list_del_init(&b->lru); @@ -349,13 +376,6 @@ static void do_checkpoint(void *data) test_bit(B_Realloc, &b->flags))) { lafs_cluster_allocate(b, 0); unlock = 0; - } else if (test_bit(B_InoIdx, &b->flags) && - (test_bit(B_Dirty, &LAFSI(b->inode)->dblock->b.flags) || - test_bit(B_Realloc, &LAFSI(b->inode)->dblock->b.flags)) && - !test_bit(B_Pinned, &LAFSI(b->inode)->dblock->b.flags)) { - /* inode data block is dirty/realloc, so allocate it */ - lafs_cluster_allocate(b, 0); - unlock = 0; } else if (test_bit(B_Index, &b->flags)) { if (atomic_read(&iblk(b)->pincnt[oldphase]) == 0) { diff --git a/cluster.c b/cluster.c index 7dd6aa8..bc80310 100644 --- a/cluster.c +++ b/cluster.c @@ -511,7 +511,6 @@ unsigned long long lafs_cluster_allocate(struct block *b, int cnum) struct wc *wc = &fs->wc[cnum]; struct lafs_inode *lai; loff_t size; - struct indexblock *inob; int used; @@ -553,48 +552,26 @@ unsigned long long lafs_cluster_allocate(struct block *b, int cnum) /* The symbiosis between the datablock and the indexblock for an * inode needs to be dealt with here. - * We don't want to write the datablock until the indexblock is - * ready (there may be incorporation still to happen) - * But when it is ready, we want to write the datablock, not the - * indexblock (as the datablock knows where the address needs to - * be incorporated). - * So if this is the datablock we flush the inode if appropriate - */ - - /* FIXME do I need any locking here? I don't think so - * but it is hard to be sure + * The data block will not normally be written until the InoIdx + * block is unpinned or phase-flipped. However if it is, as the + * result of some 'sync' style operation, then we write it even + * though the index info might not be uptodate. The rest of the + * content will be safe for roll-forward to pick up, and the rest + * will be written when the InoIdx block is ready. + * + * When the InoIdx block is ready, we don't want to write it as + * there is nowhere for it to be incorporated in to. Rather we + * move the pinning and credits across to the Data block + * which will subsequently be written. */ - if (!test_bit(B_Index, &b->flags) && - lai->type == TypeInodeFile && - dblk(b)->my_inode != NULL && - !test_bit(B_Pinned, &b->flags) /* Once the data block is pinned, - we write it */ - ) { - spin_lock(&dblk(b)->my_inode->i_data.private_lock); - inob = LAFSI(dblk(b)->my_inode)->iblock; - if (inob != NULL && - test_bit(B_Pinned, &inob->b.flags) && - test_bit(B_Phase1, &inob->b.flags) == - test_bit(B_Phase1, &b->flags) - ) { - /* Don't allocate yet, until index block is ready */ - spin_unlock(&dblk(b)->my_inode->i_data.private_lock); - lafs_iounlock_block(b, 0); - return 0; - } - spin_unlock(&dblk(b)->my_inode->i_data.private_lock); - } - /* and if this is the InoIdx block, we make sure the datablock - * is ready to be written. - */ if (test_bit(B_InoIdx, &b->flags)) { struct block *b2; int credits = 0; /* FIXME should I inode_fillblock here?? */ - if (!test_bit(B_Pinned, &lai->dblock->b.flags) && - test_bit(B_Pinned, &b->flags)) { + BUG_ON(!test_bit(B_Pinned, &b->flags)); + if (!test_bit(B_Pinned, &lai->dblock->b.flags)) { /* index block pinned, data block not, * carry the pinning across */ diff --git a/index.c b/index.c index bbbf405..260a6f8 100644 --- a/index.c +++ b/index.c @@ -753,12 +753,8 @@ void lafs_refile(struct block *b, int dec) iblk(b)->uninc_next == NULL && atomic_read(&b->refcnt) == dec+onlru && atomic_read(&iblk(b)->pincnt[0]) == 0 && - atomic_read(&iblk(b)->pincnt[1]) == 0)) && - (!test_bit(B_InoIdx, &b->flags) || - !(test_bit(B_PinPending, &LAFSI(b->inode)->dblock->b.flags) - || test_bit(B_Dirty, &LAFSI(b->inode)->dblock->b.flags) - || test_bit(B_Realloc, &LAFSI(b->inode)->dblock->b.flags)) - )) { + atomic_read(&iblk(b)->pincnt[1]) == 0)) + ) { /* Don't need to be Pinned any more */ fs = fs_from_inode(b->inode); lafs_checkpoint_lock(fs);