From 3ba20a369b1ff77f1daffa0477258706f44ca9e4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 25 Jun 2010 18:58:46 +1000 Subject: [PATCH] README update, typos, FIXME comments etc. No code. Signed-off-by: NeilBrown --- README | 614 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- block.c | 3 + clean.c | 5 + index.c | 2 +- inode.c | 5 +- modify.c | 14 +- roll.c | 3 + state.h | 2 +- super.c | 1 + 9 files changed, 634 insertions(+), 15 deletions(-) diff --git a/README b/README index 9096950..50ab3d0 100644 --- a/README +++ b/README @@ -4212,6 +4212,7 @@ super.c:655 Pinned block in lafs_release: 0/2 is Dirty with plenty of credits, so it is a child 0/16 is Dirty/Realloc, or once Async + Dirty, but not on a leaf list, not pinned segments.c:332 seg_deref with refcnt , 2 in lafs_seg_put_all @@ -4222,41 +4223,636 @@ segments.c:1028 modify.c:1708 lafs_incorporate on non-dirty/realloc block 328/0 Index(1). 1 in uninc_table - probably during truncate. + Either we add uninc while not dirty + Or we clear Dirty while uninc present + or there is a race between the two. + + Don't know: add a bugon + Bugon in get_flushable didn't fire. inode.c:843 children present in truncate after final incorp... 328/0. 64 children, no uninc list. Maybe we ran the orphans too early?? or invalidate_page isn't removing the children. - Might want print_tree here? + Might want print_tree here?- added that. + Answer: all the children are in Realloc on Clean_leafs + Maybe erase_page needs to disconnect from cleaner too?? inode.c:828 Orphan handling - uninc but not dirty: is Realloc (sometimes) Maybe like mod:1708 -block.c:67 +block.c:67 * delref 'primary' from modify.c:2063 in the q2 branch. nxt has PrimaryRef... Maybe move earlier, but that shouldn't make a diff. ditto at modify.c:2035 nxt is primary as was I, so drop mine. - -block.c:529 + Don't know - looks like sibling list got broken. + Tidied up a bit and added a print-tree. + v.interesting result. Lots of consecutive index blocks all holding primary-ref + on single primary - which is wrong. + 1/ When setting PrimaryRef, if next holds PrimaryRef, then must take reference + on self, as are being inserted into chain + 2/ When splitting, new block must be addressed as first block which cannot + fix, not first block which doesn't fit. Else incorping in reverse order + can make lots of tiny index blocks. + +block.c:529 * erase with index depth > 1. 0/328 in orphan handling. Still have 8 or 15 blocks registered! + Maybe caused by index block errors. Added some printks. -block.c:479 +block.c:479 * not enough credits to dirty block 2/0 in dir_delete_commit for unlink. 74/xxxx in unlink 16/1 in seg_inc/seg_move...allocated_block/cluster_flush -block.c:197 + - writepage wrote the page?? + - checkpoint wrote it and didn't replenish the credits? + +block.c:197 XX invalidated pages finds dirty block after EOF, after iolock_written 0/0 Dirty/Realloc in unmount - all Realloc! + Need to wait for cleaner etc to finish at unmount time. -NULL deref in 1b4 +NULL deref in 1b4 YY cleaner->cluster_flush->count_credits->lock?? + Trying to get a lock on an inode that has since been free?? + spin_lock(&dblk(b)->my_inode->i_data.private_lock); + -001001 +001001 YY generic_drop_inode -- extra iput?? in lafs_inode_checkpin from refile -6b6b6b invalidate_inode_buffers!! in kill. use-after-free +6b6b6b YY + invalidate_inode_buffers!! in kill. use-after-free 7fffff seginsert from scan_seg + MAX/number-elements confusion. Worked around for now. + + +18 June 2010 +After a couple of fixes: + 1 BUG: unable to handle kernel NULL pointer dereference at 000001b4 + 1 BUG: unable to handle kernel paging request at 00100104 + 5 BUG: unable to handle kernel paging request at 6b6b6bfb + 4 kernel BUG at /home/neilb/work/nfsbrick/fs/module/block.c:209! + 4 kernel BUG at /home/neilb/work/nfsbrick/fs/module/block.c:496! + 3 kernel BUG at /home/neilb/work/nfsbrick/fs/module/block.c:67! + 2 kernel BUG at /home/neilb/work/nfsbrick/fs/module/cluster.c:531! + 16 kernel BUG at /home/neilb/work/nfsbrick/fs/module/inode.c:601! + 1 kernel BUG at /home/neilb/work/nfsbrick/fs/module/inode.c:852! + Realloc blocks confusing truncate + 2 kernel BUG at /home/neilb/work/nfsbrick/fs/module/modify.c:118! + 1 kernel BUG at /home/neilb/work/nfsbrick/fs/module/modify.c:1699! + 7 kernel BUG at /home/neilb/work/nfsbrick/fs/module/segments.c:1033! + 19 kernel BUG at /home/neilb/work/nfsbrick/fs/module/super.c:655! + + +TODO: + - truncate gets confused by blocks being cleaned. + Need to flush cleaner, or just removed the blocks. + - when add PrimaryRef in middle of list, take the right ref. + - fix up wait-for-cleaner at unmount time. + +19 Jun 2010 + + 3 BUG: unable to handle kernel paging request at 6b6b6bfb. + 5 kernel BUG at /home/neilb/work/nfsbrick/fs/module/block.c:209! + 5 kernel BUG at /home/neilb/work/nfsbrick/fs/module/index.c:1890! + 22 kernel BUG at /home/neilb/work/nfsbrick/fs/module/inode.c:601! + 2 kernel BUG at /home/neilb/work/nfsbrick/fs/module/inode.c:835! + 3 kernel BUG at /home/neilb/work/nfsbrick/fs/module/inode.c:852! + 9 kernel BUG at /home/neilb/work/nfsbrick/fs/module/segments.c:1033! + 17 kernel BUG at /home/neilb/work/nfsbrick/fs/module/super.c:656! + 251 SysRq : Resetting + 3 SysRq : Show State + + - We can erase a dblock while it is in the uninc_pending or + uninc_next - need to be careful + - At umount, 0/2 is Dirty but not Pinned, so not written out + ditto from 0/16 + 16/0 sometimes is Async + 16/0 Async might be from the segment scan - so wait for that. + Dirty but not pinned can happen when InoIdx is pinned. + + - I think the uninc_next list (At least) should be sorted before + being allocated. + + - root block dirty/realloc/leaf in final iput + Could be it was changed during last checkpoint so + pushed in to next phase? But why Realloc? + Maybe still issue with losing inode data block. + +20 June 2010 Happy Birtyhday Dad!! + +420 runs. + 4 BUG: unable to handle kernel paging request at 6b6b6bfb. + 26 kernel BUG at /home/neilb/work/nfsbrick/fs/module/block.c:209! + 87 kernel BUG at /home/neilb/work/nfsbrick/fs/module/inode.c:601! + 1 kernel BUG at /home/neilb/work/nfsbrick/fs/module/inode.c:839!0 + 4 kernel BUG at /home/neilb/work/nfsbrick/fs/module/inode.c:856!9 + 1 kernel BUG at /home/neilb/work/nfsbrick/fs/module/modify.c:1719!3 + 12 kernel BUG at /home/neilb/work/nfsbrick/fs/module/segments.c:1033! + 2 kernel BUG at /home/neilb/work/nfsbrick/fs/module/super.c:656! + + Problems: + - inode in i_sb_list has been freed. + - block 0/0 is dirty/realloc/leaf after final iput + - not all blocks freed by truncate + - Index block with uninc is not dirty - not FIXED: more iolock in phase_flip + - still children when truncate should have finished. + all are Realloc + Maybe inode has become unhashed and we re-load it?? + it is invalid after all!! + - Index block not dirty when incorp - has uninc. ?? + - didn't wait for free segments + - Data 16/0 is dirty but not pinned after final checkpoint - FIXED + + +watch -d 'awk -f checkseg /tmp/log; echo ====== ; grep -h -E "(BUG|SysRq : )" /var/tmp/lafs-logs/log* | sort | uniq -c ; echo === ; ls /var/tmp/lafs-logs/log* | wc -l' + + + Unclear on dirtying index blocks. + We normally mark it dirty first, then add the address to the uninc list. + Note that this is the reverse of data blocks which are changed first, then + dirtied. So maybe we should mark dirty afterwards. We then need to + avoid incorporation while we are adding addresses else we might find it + has addresses but is not dirty. Only try if dirty? + Maybe we should iolock the parent. We need to do that anyway to flush + incorporations when the table is full. Yes, that fits the VM model + better. Always lock while updating and preparing to write. Set + writeback once write has started, then unlock. Cool. + Only a block is iolocked when we allocate (to 0), so we cannot lock the parent.. + +21June2010 + Apart from tracking down the remaining bugs, I need to: + 1/ Decide on locking for incorporation and attaching new address to a block + and implement it. + In particular we need to not lose the Dirty flag before the update is done. + 2/ Resolve handling of pinned inode data/index blocks + 3/ Correct handling of empty index blocks, particularly when parent is in + different phase. Make lookup be more careful? + 4/ Wait for there to be enough free segments before allowing allocation. + + 2: Problem is that we cannot handle a pinned inode-data block while the + InoIdx block is pinned in the same phase. + We currently unpin it so it drops off the leaf list. But then we + need to re-pin it when the InoIdx is unpinned or phasefliped, and that + gets ugly. Possible though. + An alternate is to treat it like a parent and keep it off the list + while the InoIdx is pinned/same-phase. So we would need to + re-assess it after unpinning or flipping the InoIdx. That is probably + a lot easier than re-pinning it. + + 1: We would normally set 'dirty' after changing the block. But we need + to differentiate Dirty from Realloc, so we set before adding addresses. + This requires that are careful not to write an index block while there + are pending changes. The fact that pinned children stop any writing, + as do pending addresses in a list should ensure this. + + 3: When an index block becomes empty we need to make sure that + future lookup doesn't get confused by it. Specifically future + index lookup must avoid the block so nothing new gets added. + Possibly a previous block will split again, but this block must remain + unused. + However we cannot update the parent block immedatiately as it might + be in a different phase. + So we must record both "don't touch this" and "where to look instead" + elsewhere - in children. + If the block being deleted is *not* the first child in the parent, + then we direct index lookup to the earlier block. + If the block being deleted *is* the first child in the parent, + then redirect to the second child if there is one and we weren't just there. + If there is no other block we flag the parent as empty and retry + from the top. + We flag a parent as empty with B_EmptyIndex. + + What locks do we need to walk around the sibling list? + the inode private_lock is minimal, but we cannot hold that to take a + iolock - just to get a reference. + I guess we + - iolock the parent + - try to find a good block using private_lock + - get a ref and wait for it. + - check if it is still a good block. If not, start again + + If we find an EmptyIndex block, it must be directly addressed by parent. + It will never be followed by a PrimaryRef block because if there were + such a block, we would have readdressed it back and hidden the EmptyIndex. + So we need to look around for an address in the parent that leads to + a non-EmptyIndex block. + + If all children are empty, we need to make the parent empty. But + what if it is InoIdx? + Maybe I am making this too hard. I could just use i_alloc_sem to + block lookups while truncate is happening. That doesn't address + single block removal e.g. from directories. + So I need to be able to wait for incorporation to happen on an + empty index block. We hold iolock on the parent. If there blocks + on ->uninc, we just process them immediately. If there are blocks on + ->uninc_next, we wait for the checkpoint to complete + + What does lafs_incorporate actually do with EmptyIndex blocks? + Providing that match currently incorp addresses, they just cause + those addresses to disappear. + + If a block is in the uninc list for its parent, then is phase_flipped + and changed and written out it could get a new physaddr before + it is incorporated. + I guess we never allocate a B_Uninc block which is in a different phase + to the parent. Currently we wouldn't do that anyway except in truncate + though memory pressure on index blocks might one day?? + Truncate? We cannot allocate directly in lafs_incorporate. + We should get lafs_cluster_allocate to notice and DTRT. + + Only hash index blocks when they are incorporated. Not needed before then. + When processing an uninc list, if an address appears twice, prefer the one + that isn't EmptyIndex... + +22June2010 + I need a clear picture of the "Steady state" for an internal index block + with it's children. + The internal index block contains 1 or more addresses. For each address there + maybe a child index block. If there is it maybe the head of a list of + blocks with B_PrimaryRef set thus holding the whole list in place until + incorporation happens. + Each of these children can be on either ->uninc_list or ->uninc_next, + or possibly neither if they haven't been queued for writing yet. Any + PrimaryRef block will be Pinned. + + When a child is incorporated and found to be Empty it is flagged as such + and then must never be returned by index lookup. Index lookup will either + add a block to a leaf index so it doesn't appear empty, or will git an EmptyIndex + block and so have to start again from the top. + + When a PrimaryRef block becomes empty it is simply removed from the + PrimaryRef chain so it cannot be found. The space now belongs to the + previous block. + When a non-PrimaryRef block which isn't the first becomes empty it is + flagged and left in place so that following blocks can be found. The + address space now belongs to the previous block. + When the first child (fileaddr matches parent) becomes empty - what? + We could re-address first child but that forces early address change - + old might not be incorp yet + We could re-address the parent, but that doesn't work for InoIdx + We could leave it there with physaddr == 0 + + Last sounds promising. So we never re-address an index block. + + So: From the top. + + Index blocks, Indirect blocks, extent blocks each have an address + that never changes. + When a block becomes over-full it splits - a new block appears with + a new address thus implicitly limiting the address space covered + by the original. + + When an index block becomes empty and has no pinned children it is + marked as EmptyIndex (under IOLock). + When an EmptyIndex is allocated it goes to phys==0 + An EmptyIndex which is not first (->fileaddr != ->parent->fileaddr) + is never used again. Its address space is ceded to the previous + index block - which could split several times... + An EmptyIndex which is first can be re-used. Once it gets pinned + children the EmptyIndex is cleared. + + An Index block always has an entry for the first address. It might + be implicit to phys==0. Loading such a block creates an empty + block. + + InoIdx doesn't get EmptyIndex, rather it gets ->depth=1 + + Indirect *doesn't* store the first address any more. + + Changes: +DONE - remove forcestart from layoutinfo +DONE - remove start-address from Indirect blocks +DONE - only hash index blocks when they are known to be incorporated. +DONE - when incorporating an uninc list, ignore phys==0 if also a block with + same fileaddr and phys!=0. so sort phys==0 first +DONE - Create EmptyIndex flag +DONE - Clear the flag when adding child pin to index block +DONE - avoid EmptyIndex non-start blocks during index lookup +DONE - allow index blocks to be loaded with ->phys==0 +DONE - allow EmptyIndex index block to be "written" to phys 0 +DONE - ensure index lookup finds implicit start address, possibly 0 + +So now after 36 runs + 3 kernel BUG at /home/neilb/work/nfsbrick/fs/module/index.c:1939! + 1 kernel BUG at /home/neilb/work/nfsbrick/fs/module/index.c:403! + 10 kernel BUG at /home/neilb/work/nfsbrick/fs/module/inode.c:605! + 14 kernel BUG at /home/neilb/work/nfsbrick/fs/module/segments.c:1034! + 4 kernel BUG at /home/neilb/work/nfsbrick/fs/module/segments.c:624! + 1 kernel BUG at /home/neilb/work/nfsbrick/fs/module/super.c:657! + 3 SysRq : Resetting + + +index.c:1939 + block 0/2 is Realloc and being allocated from cluster_flush while + parent is not Realloc or dirty + That is bad as Realloc gets set in lafs_allocated_block ... except + that the code was bad. FIXED. + +index.c:403 + cleaner is pinning a block (299/25) which is not Realloc, + and phase isn't locked. We are only meant to pin data blocks + for updates while holding a phase lock. + Ahhh - bad code again. FIXED + +inode.c:605 + Truncate doesn't clean up properly. + 327 has 60+1 + 331 has 108+1 + 327 has 34+1 + 327 has 60+1 + No sign of any children. + + Very weird. Signed in incorporation going wrong. + Added more debugging. + +Found 4084 4 12 at 890 +Added 4084 4 12 +Found 4089 4 16 at 878 +Added 4089 4 16 +Found 4094 2 20 at 866 +Added 4094 2 20 +Found 2561 2 22 at 854 +Added 514 2 22 +Found 2564 4 24 at 842 +Found 2569 2 28 at 830 +Found 0 0 0 at 818 + +Why are 2564 etc lost? No sign of alloc-to-0 + +segments.c:1034 + no free segments - need to wait somewhere. + +segments.c:624 + allocated_blocks has gone over free_blocks! + in lafs_prealloc/reserve_block/free_get/ss_put/new_segment.../checkpoint. + Wanted CleanSpace to reserve the youthblk + Maybe related to not waiting - ignore for now. + +super.c:657 + block 0/2 was dirty but not pinned. Should not happen to inodes. + block 0/0 was Pinned because it had a child - as above. + + Maybe we don't carry the pin across when we collapse dir + into inode??... looks quite likely + + +23 June 2010 + +116 runs. + 1 BUG: unable to handle kernel paging request at 6b6b6bfb + 1 kernel BUG at /home/neilb/work/nfsbrick/fs/module/block.c:497! + 3 kernel BUG at /home/neilb/work/nfsbrick/fs/module/dir.c:710! + 7 kernel BUG at /home/neilb/work/nfsbrick/fs/module/inode.c:606! + 61 kernel BUG at /home/neilb/work/nfsbrick/fs/module/segments.c:1034! + 1 kernel BUG at /home/neilb/work/nfsbrick/fs/module/super.c:657! + 42 SysRq : Resetting + + +6b6b6bfb: + invalidate_inode_buffers called on at shutdown. + Still wierd + +block.c:497 FIXED?? + block 16/1 is not dirty with no credits. + Maybe writepage got to it? + +dir.c:710 + ouch! dir lookup failed in unlink. + No real hints. Must be hash based - some off-by-one probably. + Need to stare at the code. + +inode.c:606 FIXED + Blocks still present after truncate. + typically about 60, but in 1 case '4'. No index blocks. + So probably content of second index block. + Yes, lafs_leaf_next was doing the wrong thing for addresses + before start of block. + +segments.c:1034 + same old + +super.c:657 FIXED + dir inode 0/2 is still Dirty but not pinned. + Maybe lafs_dirty_inode should be pinning the block + + But now this triggers for 16/X still dirty. + + +How and when to write blocks in a SegmentMap file? + - We don't want normal write-back to write them unless they have + no references + - We need to write them in tail of checkpoint, and index info must + follow in the next checkpoint. + +lafs_space_alloc is called from + - mark_cleaning: always CleanSpace, failure is OK + - lafs_cluster_update_pin: ReleaseSpace. -EAGAIN is OK (CHECK THIS) but failure + is not - or shouldn't be. + - lafs_allocated_block: CleanSpace, checking if parent of Realloc block + can be saved separately from any Dirty version. Failure OK, blocking not. + - lafs_prealloc - general space allocation. + - +lafs_cluster_update_pin is call from: + - lafs_create, lafs_link, lafs_unlink, lafs_rmdir, lafs_symlink, lafs_mkdir + lafs_mknod, lafs_rename, + - lafs_write_inode + So best to return -EAGAIN, and it should be handled adequately. + +lafs_prealloc is called from: + - lafs_reserve_block, after modifying the alloc_type extensively. + - lafs_phase_flip to re-fill the 'next' credits. If they aren't available + we simply pin all children so they aren't needed. + So failure is OK + - lafs_seg_ref_block: getting CleanSpace to save segusage blocks. + If this fails .. what?? lafs_reserve_block fails. so... + +lafs_reserve_block is called from + - mark_cleaning - CleanSpace + - lafs_pin_dblock - type is passed int... + - lafs_prepare_write - on failure write will fail or retry after checkpoint + - lafs_inode_handle_orphan - to help with delete. On failure we allow + cleaning to happen + - lafs_seg_move - should be elsewhere. Failure BAD ! + - lafs_free_get - as above, failure BAD + - clean_free - update youth for new clean blocks - Failure BAD + +lafs_pin_dblock is called from + - dir_create_pin - fail or again handled + - dir_delete_pin + - dir_update_pin + - lafs_create etc + - lafs_dir_handle_orphan + - choose_free_inum + - inode_map_new_pin + - lafs_new_inode + ... + - lafs_orphan_release !! cannot handle failure + - roll_block should use AccountSpace + +So: It seems we need a new allocation class that will never fail. + Maybe it is allowed to BUG though? + AccountSpace - i.e. space need to account for the use of space. + Must never ever fail. + +Then we must ask where blocking should happen on -EAGAIN. + dir.c does "lafs_checkpoint_unlock_wait", then tries again. + prepare_write does too. + +For that to work we must start a checkpoint on returned EAGAIN.... Don't +we want to wait for some cleaning to happen first though? Maybe an extra +flag, and a count of the number of empty (but not clean) blocks. + +- Should I skip orphan handling when tight on space? Probably not. It will + just keep failing while we keep cleaning... +- roll_block should use account_space .. or not + +- lafs_space_alloc simply allocates space, or fails. 'why' is used to + guide watermark choice. +- lafs_prealloc allocates space to a block and all its parents base on + 'why' for watermarks. It either succeeds or failed. + +- lafs_cluster_update_pin and lafs_reserve_block decide whether to respond + to failure as -ENOSPC or -EAGAIN based on 'why'. + +- lafs_pin_dblock simply passes on the failure, which must be handled. + +So: What to do when we return -EAGAIN? + We need to wait until there are *enough* clean segments, then cause a checkpoint + so they become free. + So a flag that says 'waiting for free space' and a count of segments + required. + + But how do we differentiate ENOSPC and EAGAIN for NewSpace requests? + Maybe we don't ?? Or do it later. + +Still to do: +- Audit all AccountSpace and justify them + + lafs_seg_move is probably wrong. Should have allocated when the + free segment was allocated +- lafs_orphan_release called lafs_pin_dblock but cannot handle failure +- Need to wait not just for "enough space" but for "enough clean segments". + +- how is 'free_blocks' set - what does this tell us?? + + free_blocks is the sum of known-clean segments. + We probably want: + clean segments + remainder for each active segment + then reserve some segments for cleaning. + And separate 'allocated_block' for each ? + +Notes: + segments.c:647 fired: AccountSpace had no space available. + Reserving space to write the segusage of youth block for a newly + allocated segment. + super.c:657 STILL + 0/2 is Dirty but not Pinned Maybe we need PinPending + soft lockup + in the cleaner! + Maybe I need cond_resched?? + +Maybe I want two separate 'free_blocks' counters. + One that includes all free blocks for use in 'df' etc. + One that only includes completely free segments for use in allocation... + + +24 June 2010 + + Something is wrong with cleaning and segment tracking + We have 5 free segments and we get them all without writing + anything! We consumer them all with cluster_flush! + It seems that the root inode is not changing phase! + Nothing is on the phase leafs. + Most children are in Writeback on cluster. and are Realloc + Others have pinned children. + They are all in 'cluster', but 'flush' doesn't flush them, + so they must be in a different clister??? Is the cleaner still + cleaning? Yes, they are on the cleaner 'wc' list so they are + queued but not flush for the cleaner. + +25 June 2010 + At last it looks like I nearly have a working FS. Out of 361 test + runs, 9 triggered BUGS and one hung at umount. + + I need a new TODO list, starting with 6 jul 2007(!) and adding any + FIXMEs etc. + +DONE 0/ start TODO list +DONE 1/ document new bugs +DONE 2/ Tidy up all recent changes as individual commits. + 3/ clean up the various 'scratch' patches discarding any tracing that + I don't think I need, and making the rest 'dprintk' etc. + 4/ check in this README file + 5/ Write rest of the TODO list + + 6/ soft locking in unlink call. + EIP is at lafs_hash_name+0xa5/0x10f [lafs] + [] hash_piece+0x18/0x65 [lafs] + [] lafs_dir_del_ent+0x4e/0x404 [lafs] + [] ? lafs_hash_name+0xfa/0x10f [lafs] + [] dir_delete_commit+0xdb/0x187 [lafs] + [] lafs_unlink+0x144/0x1f4 [lafs] + [] vfs_unlink+0x4e/0x92 + + Would we be spinning on -EAGAIN ?? 4 empty segment are present. + + 7/ block.c:624 in lafs_dirty_iblock - no pin, no credits + truncate -> lafs_invalidate_page -> lafs_erase_dblock -> lafs_allocated_block / lafs_dirty_iblock +Allocated [ce44f240]327/144(1499)r2E:Writeback,PhysValid clean2(1) cleaning(1) -> 0 +SEGMOVE 1499 0 +Oh dear: [ce44f240]327/144(0)r2E:Writeback,PhysValid clean2(1) cleaning(1) +.......: [cfb69180]327/0(349)r2F:Index(1),Pinned,Phase0,Valid,PhysValid{0,0}[0] child(1) leaf(1) +Why have I no credits? +/home/neilb/work/nfsbrick/fs/module/block.c:624: [cfb69180]327/0(349)r2F:Index(1),Pinned,Phase0,Valid,Dirty,PhysValid{0,0}[0] child(1 +) leaf(1) + + Cleaning is racing with truncate, and that cannot happen!! + + 8/ looping in do_checkpoint + root is still i Phase1 because 0/2 is in Phase 1 + [cfa57c58]0/2(2078)r1E:Pinned,Phase1,WPhase0,Valid,Dirty,C,CI,CN,CNI,UninCredit,IOLock,PhysValid writepageflush(1) + Seems to be waiting for writeback, but writeback is clear. + Need to call lafs_io_wake in lafs_iocheck_writeback for when + it is called by lafs_writepage + + 9/ cluster.c:478 + flush_data_To_inode finds Realloc (not dirty) block + and InoIdx block is not Valid. + [cfb5ef50]2/0(3)r1F:Index(0),Pinned,Phase1,InoIdx,SegRef,C,CI,CN,CNI,IOLock,OnFree +,PhysValid{0,1}[0] child(1) + I wonder if it was PinPending, or where it was IOLocked (or if). + + I guess we truncated, then added data, then tried to clean. + Probably just a bad 'bug' given recent changes. + +10/ inode.c:606 + Deleting inode 328: 2+0+0 1+0 + + 2 level index. + first index at level 1 was full and prune properly. + Nothing else found empty. + Somehow the second index block and contents were lost. + +11/ super.c:657 + Root still pinned at unmount. + 0/2 is Dirty: [cfa53c58]0/2(1750)r0E:Valid,Dirty,CN,CNI,UninCredit,PhysValid + [cfa5fc58]0/2(2852)r0E:Valid,Dirty,SegRef,CN,CNI,UninCredit,PhysValid + [cfa53c58]0/2(3570)r0E:Valid,Dirty,CN,CNI,UninCredit,PhysValid + [cfa53828]0/2(2969)r0E:Valid,Dirty,CN,CNI,UninCredit,PhysValid + maybe dir-orphan handling stuffed up + +12/ timeout/showstate in unmount + umount is in sync_inodes / do_writepages / lafs_writepage / lafs_iolock_written + That looks similar to 8 + +13/ delete_inode should wait for pending truncate to complete. + Document I_Trunc somewhere - including that i_mutex is needed to set it. + Verify that assertion. + +14/ Review writepage and flush and make sure we flush often enough but + not too often. diff --git a/block.c b/block.c index 9f8680b..d0c9687 100644 --- a/block.c +++ b/block.c @@ -426,6 +426,9 @@ lafs_pin_dblock(struct datablock *b, int alloc_type) * - reference the old segment * - update flags and pointers. */ + /* FIXME I probably need an iolock here to avoid racing with + * lafs_cluster_allocate which can clear dirty and so lose credits. + */ int err; struct fs *fs = fs_from_inode(b->b.inode); struct block *blk; diff --git a/clean.c b/clean.c index 069c54f..e5ed195 100644 --- a/clean.c +++ b/clean.c @@ -394,6 +394,10 @@ static int try_clean(struct fs *fs, struct toclean *tc) } else { dprintk("got the inode\n"); /* Minor optimisation for files that have shrunk */ + /* Actually this is critical for handling truncation + * properly. We don't want to even 'get' a block beyond + * EOF, certainly not after the truncate_inode_pages. + */ if (LAFSI(ino)->type >= TypeBase && ((loff_t)bnum << ino->i_blkbits) >= i_size_read(ino)) goto skip; @@ -401,6 +405,7 @@ static int try_clean(struct fs *fs, struct toclean *tc) MKREF(cleaning)); if (b == NULL) break; + /* FIXME repeat size test? Is any locking appropriate? */ if (list_empty(&b->cleaning)) { list_add(&b->cleaning, &tc->cleaning); igrab(ino); diff --git a/index.c b/index.c index 73936f8..6186db1 100644 --- a/index.c +++ b/index.c @@ -109,7 +109,7 @@ void lafs_release_index(struct list_head *head) /* This is the other place that index blocks get freed. * They are freed either by memory pressure in lafs_shrinker * above, or when an inode is destroyed by this code. - * The need to both obey the same locking rules, and ensure + * They need to both obey the same locking rules, and ensure * blocks are removed from each of siblings, lru, and hash. * This list is an inode's free_index and are linked on * b.siblings. There are all on the freelist.lru diff --git a/inode.c b/inode.c index 6c0f062..d40c27a 100644 --- a/inode.c +++ b/inode.c @@ -722,7 +722,7 @@ void lafs_inode_handle_orphan(struct datablock *b) ib = lafs_leaf_find(ino, trunc_next, ADOPT, &next_trunc, ASYNC, MKREF(inode_handle_orphan3)); - + /* now hold an iolock on ib */ if (IS_ERR(ib)) return; @@ -792,6 +792,9 @@ void lafs_inode_handle_orphan(struct datablock *b) if (ib->uninc_table.pending_cnt == 0 && ib->uninc == NULL) { lafs_dirty_iblock(ib); + /* FIXME this just removes 8 blocks at a time, + * which is not enough + */ lafs_walk_leaf_index(ib, prune_some, ib); } if (test_bit(B_Dirty, &ib->b.flags)) diff --git a/modify.c b/modify.c index 33b2cf8..e14a12c 100644 --- a/modify.c +++ b/modify.c @@ -430,6 +430,10 @@ static void grow_index_tree(struct indexblock *ib, struct indexblock *new) LAFSI(new->b.inode)->depth++; ((struct la_inode*)(ibuf))->depth = LAFSI(new->b.inode)->depth; + /* There is no need to set PrimaryRef as the first child in an + * index block is implicitly incorporated - at least enough to be + * found. So the ->parent link holds our place in the tree + */ unmap_iblock(new, ibuf); LAFS_BUG(LAFSI(new->b.inode)->iblock != ib, &ib->b); @@ -666,7 +670,7 @@ static int check_leaf(void *data, u32 addr, u64 phys, int len) /* We are initialising the structure */ memset(li, 0, sizeof(*li)); li->firstaddr = addr; - li->blksize = len - 2; /* exclude type filed */ + li->blksize = len - 2; /* exclude type field */ return 0; } if (len == 0) @@ -868,7 +872,7 @@ static u32 walk_extent(u32 addr, char **bufp, int len, struct uninc *ui, /* just submit next extent from ui */ s32 overlap; int hlen = ui->pending_addr[uinum].cnt - uioffset; - /* If these is an with the current extent, only + /* If there is an overlap with the current extent, only * add the overlapped portion. */ dprintk("(%llu %lu %u) (%llu %lu %u) + %d = %d (%d)\n", @@ -1534,7 +1538,7 @@ static int do_incorporate_internal(struct fs *fs, struct indexblock *ib, struct block *b; /* This block really needs to be incorporated, or * we might not be able to find it - it might be - * a primary for other blocks to. + * a primary for other blocks too. * So do a minimal incorporation of this block. * That must succeed as it will be an addition to an * empty block, or a replacement. @@ -1897,6 +1901,10 @@ int __must_check lafs_prealloc(struct block *blk, int why) else if (!test_and_set_bit(B_ICredit, &b->flags)) credits--; } + /* FIXME: should I check PinPending rather than the different + * inode types? + * And do I only need NCredit if already dirty ... or something + */ if (test_bit(B_Index, &b->flags) || LAFSI(b->inode)->type == TypeInodeFile || LAFSI(b->inode)->type == TypeOrphanList || diff --git a/roll.c b/roll.c index 1092eb8..5ed46ed 100644 --- a/roll.c +++ b/roll.c @@ -346,11 +346,13 @@ roll_block(struct fs *fs, int fsnum, int inum, int trunc, int flg, 0, fs->phase, 1); blk->b.physaddr = baddr; lafs_dirty_iblock(blk->b.parent); + /* FIXME maybe set Writeback and unlock */ if (lafs_add_block_address(fs, &blk->b) == 0) /* FIXME if the table becomes full, we have a problem... */ LAFS_BUG(1, &blk->b); dprintk("Allocated block %lu to %llu\n", (unsigned long)bnum, baddr); + /* FIXME maybe clear Writeback instead */ lafs_iounlock_block(&blk->b); clear_bit(B_PinPending, &blk->b.flags); @@ -588,6 +590,7 @@ lafs_mount(struct fs *fs) fs->devs[d].segsum = lafs_iget(fs->prime_sb, fs->devs[d].usage_inum, SYNC); + /* FIXME check this is a segusage file */ } lafs_checkpoint_lock(fs); err = roll_forward(fs); diff --git a/state.h b/state.h index 0614372..6dd3351 100644 --- a/state.h +++ b/state.h @@ -473,7 +473,7 @@ enum { B_PrimaryRef, /* This indexblock has not been incorporated into the * parent, and so can only be found by being a sibling * of a 'primary' which is incorporated. Consequently - * a counted references is held on ->siblings.next, + * a counted reference is held on ->siblings.prev, * which is either the primary or another block holding * a PrimaryRef. */ diff --git a/super.c b/super.c index 96f3df0..8951caf 100644 --- a/super.c +++ b/super.c @@ -695,6 +695,7 @@ lafs_put_super(struct super_block *sb) */ set_bit(CleanerDisabled, &fs->fsstate); + /* FIXME maybe I want these tests in a spinlock ?? */ wait_event(fs->async_complete, !test_bit(OrphansRunning, &fs->fsstate) && list_empty(&fs->pending_orphans) && -- 2.39.5