From 98a3163be76264e7dcb02a51c3d99b613533a3aa Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sat, 26 Jun 2010 11:16:10 +1000 Subject: [PATCH] Change flushing of space-accounting blocks. Space-accounting blocks need to be flushed very late in the checkpoint. We were special casing these, but in an awkward way. Change it so that these blocks are pinned, but that a checkpoint doesn't handle them straight away but rather performs a phase_flip and then queues them for later handling. This means that we get more consistent behaviour of pinned data blocks and writepage doesn't need to special-case the flushing of segment usage blocks. Signed-off-by: NeilBrown --- checkpoint.c | 50 +++++++++++++++++++++++++++++++++++++++++++------- file.c | 4 ---- index.c | 39 +++++++++++++++++++++++++++++++++------ lafs.h | 2 +- quota.c | 5 ----- segments.c | 16 ---------------- state.h | 11 ++++++++--- super.c | 1 + 8 files changed, 86 insertions(+), 42 deletions(-) diff --git a/checkpoint.c b/checkpoint.c index ef49f2b..79b9de7 100644 --- a/checkpoint.c +++ b/checkpoint.c @@ -223,6 +223,13 @@ int lafs_print_tree(struct block *b, int depth) } j++; } + list_for_each_entry(b2, &dfs->account_leafs, lru) { + if (b2 == b) { + printk(" Account(%d) ", j); + break; + } + j++; + } list_for_each_entry(b2, &freelist.lru, lru) if (b2 == b) { printk(" on free "); @@ -380,7 +387,24 @@ again: LAFS_BUG(!!test_bit(B_Phase1, &b->flags) != oldphase, b); - if (test_bit(B_Index, &b->flags) && + if (test_bit(B_PinPending, &b->flags) && + !test_bit(B_Index, &b->flags) && + (LAFSI(b->inode)->type == TypeSegmentMap || + LAFSI(b->inode)->type == TypeQuota)) { + /* Need to delay handling of this block until + * the phase change has finished, to just + * before we finish the checkpoint. + * Note that we don't check if they are dirty - + * they might not be yet - the whole point of the + * delay is that changes are still arriving. 
+ */ + lafs_flip_dblock(dblk(b)); + /* access to account_leafs is single-threaded + * by the cleaner thread so no locking needed + */ + getref(b, MKREF(accounting)); + list_add(&b->lru, &fs->account_leafs); + } else if (test_bit(B_Index, &b->flags) && (iblk(b)->uninc_table.pending_cnt || iblk(b)->uninc)) { lafs_incorporate(fs, iblk(b)); @@ -433,14 +457,23 @@ again: lafs_clusters_done(fs); } +static void flush_accounting(struct fs *fs) +{ + while (!list_empty(&fs->account_leafs)) { + struct block *b = list_first_entry(&fs->account_leafs, + struct block, + lru); + list_del_init(&b->lru); + lafs_iolock_block(b); + lafs_cluster_allocate(b, 0); + putref(b, MKREF(accounting)); + } + fs->qphase = fs->phase; +} + static void finish_checkpoint(struct fs *fs, int youth) { - set_bit(CheckpointFlushing, &fs->fsstate); - lafs_seg_flush_all(fs); - lafs_quota_flush(fs); - clear_bit(CheckpointFlushing, &fs->fsstate); - if (!test_bit(FinalCheckpoint, &fs->fsstate)) - lafs_seg_apply_all(fs); + flush_accounting(fs); /* if we are creating a snapshot, special handling is needed */ if (LAFSI(fs->ss[0].root)->md.fs.usagetable > 1) { @@ -454,6 +487,9 @@ static void finish_checkpoint(struct fs *fs, int youth) dprintk("FinalFlush %d\n", fs->seq); lafs_cluster_flush(fs, 0); + if (!test_bit(FinalCheckpoint, &fs->fsstate)) + lafs_seg_apply_all(fs); + lafs_write_state(fs); dprintk("State written, all done %d\n", fs->seq); diff --git a/file.c b/file.c index 83ded7a..277c02c 100644 --- a/file.c +++ b/file.c @@ -212,10 +212,6 @@ lafs_writepage(struct page *page, struct writeback_control *wbc) if (test_bit(B_Dirty, &b->b.flags)) { if (test_bit(B_PinPending, &b->b.flags)) redirty = 1; - else if (LAFSI(ino)->type == TypeSegmentMap && - !test_bit(CheckpointFlushing, &fs->fsstate)) - /* FIXME or quota ?? 
*/ - redirty = 1; else if (LAFSI(b->b.inode)->type == TypeInodeFile && b->my_inode && LAFSI(b->my_inode)->iblock) diff --git a/index.c b/index.c index 2870d67..5884e79 100644 --- a/index.c +++ b/index.c @@ -455,7 +455,8 @@ static void flip_phase(struct block *b) { struct indexblock *p; int oldphase = !!test_bit(B_Phase1, &b->flags); - + + LAFS_BUG(!test_bit(B_Pinned, &b->flags), b); if (oldphase) clear_bit(B_Phase1, &b->flags); else @@ -479,20 +480,46 @@ static void flip_phase(struct block *b) set_bit(B_ICredit, &b->flags); } +/* When the pinning of a block needs to be carried across a + * checkpoint, we need to 'flip' the phase. + * This only applies to blocks that can be pinned by a block that + * may not be written until the next phase. + * This includes any index block (as it may have some children in one + * phase and some in the next) and any space-accounting block + * for the same reason. + * So indexblocks will need to flip, and use lafs_phase_flip. + * TypeSegmentMap and TypeQuota also need to flip and use lafs_flip_dblock. + * TypeInodeFile don't need to be phase_flipped, though their InoIdx + * block might. InodeFile blocks are only pinned by metadata transactions + * which happen inside a checkpoint lock. + */ + +void lafs_flip_dblock(struct datablock *db) +{ + /* This is an accounting block (SegmentMap or Quota) + * which we need to write out after ->phase has changed + * in the tail of the checkpoint. + * We always flip the phase and reallocate from AccountSpace. + * If all references get dropped, it will then get unpinned + * before the next phase finishes - we don't unpin here + * (unlike lafs_phase_flip for index blocks). 
+ */ + flip_phase(&db->b); + lafs_prealloc(&db->b, AccountSpace); + /* Parent might need to be on a leaflist now */ + lafs_refile(&db->b.parent->b, 0); +} + void lafs_phase_flip(struct fs *fs, struct indexblock *ib) { /* We are performing a checkpoint, this block has been written * out and now needs to be flipped into the next phase. - * This only makes sense for an index block. Datablocks are - * simply unpinned at phase change. + * * It involves. * - Processing all uninc_next blocks into uninc_table. * - adjusting counts on parent * - moving credits from 'next' to 'this' phase. * - update block counts to included phase-delayed updates. - *NO: This is done when 'writing' the InoIdx. - * For InoIdx, we transfer the pinning and Credits to the - * Data block rather than release them. */ int oldphase = !!test_bit(B_Phase1, &ib->b.flags); struct block *ulist; diff --git a/lafs.h b/lafs.h index 75dcfef..bd937a3 100644 --- a/lafs.h +++ b/lafs.h @@ -655,6 +655,7 @@ static inline void lafs_pin_block(struct block *b) } int lafs_add_block_address(struct fs *fs, struct block *blk); +void lafs_flip_dblock(struct datablock *db); void lafs_phase_flip(struct fs *fs, struct indexblock *ib); struct indexblock * __must_check lafs_make_iblock(struct inode *ino, int adopt, int async, REFARG); @@ -667,7 +668,6 @@ void lafs_write_head(struct fs *fs, struct cluster_head *head, u64 virt, void lafs_write_block(struct fs *fs, struct block *b, int dev, struct wc *wc); /* quota.c */ -void lafs_quota_flush(struct fs *fs); int lafs_quota_allocate(struct fs *fs, struct inode *ino, int diff); #define __wait_event_lock(wq, condition, lock) \ diff --git a/quota.c b/quota.c index 5f74f06..478423f 100644 --- a/quota.c +++ b/quota.c @@ -5,11 +5,6 @@ void lafs_qcommit(struct fs *fs, struct inode *ino, int diff, int phase) { } -void lafs_quota_flush(struct fs *fs) -{ - fs->qphase = fs->phase; -} - int lafs_quota_allocate(struct fs *fs, struct inode *ino, int diff) { return 0; diff --git a/segments.c 
b/segments.c index d698cc8..ada9ea6 100644 --- a/segments.c +++ b/segments.c @@ -393,22 +393,6 @@ void lafs_seg_move(struct fs *fs, u64 oldaddr, u64 newaddr, } } -/* lafs_seg_flush_all - * All segment usage tables should be flushed to storage. - * This is called towards the end of performing a checkpoint, after - * the entire phase tree has been committed. The blocks written - * here record the status of the finishing phase, but they themselves - * become part of the next phase. They can be found during roll-forward - * as their write-clusters are still flagged as being part of a checkpoint. - */ -void lafs_seg_flush_all(struct fs *fs) -{ - int d; - for (d = 0; d < fs->devices ; d++) - write_inode_now(fs->devs[d].segsum, 0); - for (d = 0; d < fs->devices ; d++) - write_inode_now(fs->devs[d].segsum, 1); -} static void seg_apply(struct fs *fs, struct segsum *ss) { diff --git a/state.h b/state.h index ad80ff9..fc32c29 100644 --- a/state.h +++ b/state.h @@ -89,8 +89,7 @@ struct fs { #define FinalCheckpoint 3 #define CleanerDisabled 4 #define OrphansRunning 5 -#define CheckpointFlushing 6 /* We are writing the segusage blocks */ -#define CleanerBlocks 7 /* One or more threads is blocked waiting for the +#define CleanerBlocks 6 /* One or more threads is blocked waiting for the * cleaner to progress - cleaner.need blocks are * needed. */ @@ -160,6 +159,12 @@ struct fs { * have no pinned children * and are being cleaned */ + struct list_head account_leafs; /* list of accounting blocks + * that we need to write after + * the checkpoint is done. + * They are now pinned to the + * next phase. 
+ */ /* Youth management */ int youth_next; /* number to assign to next segment */ @@ -328,7 +333,7 @@ struct block { * reachability-set as this block */ - struct list_head lru; /* phase_leafs, clean_leafs, + struct list_head lru; /* phase_leafs, clean_leafs, account_leafs, * clhead, pending_blocks */ struct list_head peers; /* other blocks that use the same location diff --git a/super.c b/super.c index 5bc8cee..0819f83 100644 --- a/super.c +++ b/super.c @@ -527,6 +527,7 @@ lafs_load(struct options *op, int newest) INIT_LIST_HEAD(&fs->phase_leafs[0]); INIT_LIST_HEAD(&fs->phase_leafs[1]); INIT_LIST_HEAD(&fs->clean_leafs); + INIT_LIST_HEAD(&fs->account_leafs); atomic_set(&fs->sb_writes_pending, 0); init_waitqueue_head(&fs->sb_writes_wait); init_waitqueue_head(&fs->async_complete); -- 2.39.5