This is a performance and correctness fix against the writeback paths.
The writeback code has competing requirements. Sometimes it is used
for "memory cleansing": kupdate, bdflush, writer throttling, page
allocator writeback, etc. And sometimes this same code is used for
data integrity pruposes: fsync, msync, fdatasync, sync, umount, various
other kernel-internal uses.
The problem is: how to handle a dirty buffer or page which is currently
under writeback.
For memory cleansing, we just want to skip that buffer/page and go onto
the next one. But for sync, we must wait on the old writeback and then
start new writeback.
mpage_writepages() is current correct for cleansing, but incorrect for
sync. block_write_full_page() is currently correct for sync, but
inefficient for cleansing.
The fix is fairly simple.
- In mpage_writepages(), don't skip the page is it's a sync
operation.
- In block_write_full_page(), skip the buffer if it is a sync
operation. And return -EAGAIN to tell the caller that the writeout
didn't work out. The caller must then set the page dirty again and
move it onto mapping->dirty_pages.
This is an extension of the writepage API: writepage can now return
EAGAIN. There are only three callers, and they have been updated.
fail_writepage() and ext3_writepage() were actually doing this by
hand. They have been changed to return -EAGAIN. NTFS will want to
be able to return -EAGAIN from its writepage as well.
- A sticky question is: how to tell the writeout code which mode it
is operating in? Cleansing or sync?
It's such a tiny code change that I didn't have the heart to go and
propagate a `mode' argument down every instance of writepages() and
writepage() in the kernel. So I passed it in via current->flags.
Incidentally, the occurrence of a locked-and-dirty buffer in
block_write_full_page() is fairly rare: normally the collision avoidance
happens at the address_space level, via PageWriteback. But some
mappings (blockdevs, ext3 files, etc) have their dirty buffers written
out via submit_bh(). It is these buffers which can stall
block_write_full_page().
This wart will be pretty intrusive to fix. ext3 needs to become fully
page-based (ugh. It's a block-based journalling filesystem, and pages
are unnatural). blockdev mappings are still written out by buffers
because that's how filesystems use them. Putting _all_ metadata
(indirects, inodes, superblocks, etc) into standalone address_spaces
would fix that up.
- filemap_fdatawrite() sets PF_SYNC. So filemap_fdatawrite() is the
kernel function which will start writeback against a mapping for
"data integrity" purposes, whereas the unexported, internal-only
do_writepages() is the writeback function which is used for memory
cleansing. This difference is the reason why I didn't consolidate
those functions ages ago...
- Lots of code paths had a bogus extra call to filemap_fdatawait(),
which I previously added in a moment of weak-headedness. They have
all been removed.
/* We need to protect against concurrent writers.. */
down(&inode->i_sem);
- ret = filemap_fdatawait(inode->i_mapping);
- err = filemap_fdatawrite(inode->i_mapping);
- if (!ret)
- ret = err;
+ ret = filemap_fdatawrite(inode->i_mapping);
err = file->f_op->fsync(file, dentry, 0);
if (!ret)
ret = err;
goto out_putf;
down(&inode->i_sem);
- ret = filemap_fdatawait(inode->i_mapping);
- err = filemap_fdatawrite(inode->i_mapping);
- if (!ret)
- ret = err;
+ ret = filemap_fdatawrite(inode->i_mapping);
err = file->f_op->fsync(file, dentry, 1);
if (!ret)
ret = err;
* the page lock, whoever dirtied the buffers may decide to clean them
* again at any time. We handle that by only looking at the buffer
* state inside lock_buffer().
+ *
+ * If block_write_full_page() is called for regular writeback
+ * (called_for_sync() is false) then it will return -EAGAIN for a locked
+ * buffer. This only can happen if someone has written the buffer directly,
+ * with submit_bh(). At the address_space level PageWriteback prevents this
+ * contention from occurring.
*/
static int __block_write_full_page(struct inode *inode,
struct page *page, get_block_t *get_block)
{
int err;
+ int ret = 0;
unsigned long block;
unsigned long last_block;
struct buffer_head *bh, *head;
do {
get_bh(bh);
if (buffer_mapped(bh) && buffer_dirty(bh)) {
- lock_buffer(bh);
+ if (called_for_sync()) {
+ lock_buffer(bh);
+ } else {
+ if (test_set_buffer_locked(bh)) {
+ ret = -EAGAIN;
+ continue;
+ }
+ }
if (test_clear_buffer_dirty(bh)) {
if (!buffer_uptodate(bh))
buffer_error();
unlock_buffer(bh);
}
}
- bh = bh->b_this_page;
- } while (bh != head);
+ } while ((bh = bh->b_this_page) != head);
BUG_ON(PageWriteback(page));
SetPageWriteback(page); /* Keeps try_to_free_buffers() away */
SetPageUptodate(page);
end_page_writeback(page);
}
+ if (err == 0)
+ return ret;
return err;
recover:
/*
/*
* We have to fail this writepage to avoid cross-fs transactions.
- * Put the page back on mapping->dirty_pages, but leave its buffer's
- * dirty state as-is.
+ * Return EAGAIN so the caller will the page back on
+ * mapping->dirty_pages. The page's buffers' dirty state will be left
+ * as-is.
*/
- __set_page_dirty_nobuffers(page);
+ ret = -EAGAIN;
unlock_page(page);
return ret;
}
}
struct address_space_operations ext3_writeback_aops = {
- .readpage = ext3_readpage, /* BKL not held. Don't need */
- .readpages = ext3_readpages, /* BKL not held. Don't need */
- .writepage = ext3_writepage, /* BKL not held. We take it */
+ .readpage = ext3_readpage, /* BKL not held. Don't need */
+ .readpages = ext3_readpages, /* BKL not held. Don't need */
+ .writepage = ext3_writepage, /* BKL not held. We take it */
.writepages = ext3_writepages, /* BKL not held. Don't need */
.sync_page = block_sync_page,
.prepare_write = ext3_prepare_write, /* BKL not held. We take it */
/*
* write out dirty pages of bmap
*/
- filemap_fdatawait(ipbmap->i_mapping);
filemap_fdatawrite(ipbmap->i_mapping);
filemap_fdatawait(ipbmap->i_mapping);
/*
* write out dirty pages of imap
*/
- filemap_fdatawait(ipimap->i_mapping);
filemap_fdatawrite(ipimap->i_mapping);
filemap_fdatawait(ipimap->i_mapping);
jERROR(1, ("diFreeSpecial called with NULL ip!\n"));
return;
}
- filemap_fdatawait(ip->i_mapping);
filemap_fdatawrite(ip->i_mapping);
filemap_fdatawait(ip->i_mapping);
truncate_inode_pages(ip->i_mapping, 0);
* We need to make sure all of the "written" metapages
* actually make it to disk
*/
- filemap_fdatawait(sbi->ipbmap->i_mapping);
- filemap_fdatawait(sbi->ipimap->i_mapping);
- filemap_fdatawait(sbi->direct_inode->i_mapping);
filemap_fdatawrite(sbi->ipbmap->i_mapping);
filemap_fdatawrite(sbi->ipimap->i_mapping);
filemap_fdatawrite(sbi->direct_inode->i_mapping);
*
* if ((!S_ISDIR(ip->i_mode))
* && (tblk->flag & COMMIT_DELETE) == 0) {
- * filemap_fdatawait(ip->i_mapping);
* filemap_fdatawrite(ip->i_mapping);
* filemap_fdatawait(ip->i_mapping);
* }
* Make sure all metadata makes it to disk before we mark
* the superblock as clean
*/
- filemap_fdatawait(sbi->direct_inode->i_mapping);
filemap_fdatawrite(sbi->direct_inode->i_mapping);
filemap_fdatawait(sbi->direct_inode->i_mapping);
*/
dbSync(sbi->ipbmap);
diSync(sbi->ipimap);
- filemap_fdatawait(sbi->direct_inode->i_mapping);
filemap_fdatawrite(sbi->direct_inode->i_mapping);
filemap_fdatawait(sbi->direct_inode->i_mapping);
* We need to clean out the direct_inode pages since this inode
* is not in the inode hash.
*/
- filemap_fdatawait(sbi->direct_inode->i_mapping);
filemap_fdatawrite(sbi->direct_inode->i_mapping);
filemap_fdatawait(sbi->direct_inode->i_mapping);
truncate_inode_pages(sbi->direct_mapping, 0);
jERROR(1, ("jfs_umount failed with return code %d\n", rc));
}
out_mount_failed:
- filemap_fdatawait(sbi->direct_inode->i_mapping);
filemap_fdatawrite(sbi->direct_inode->i_mapping);
filemap_fdatawait(sbi->direct_inode->i_mapping);
truncate_inode_pages(sbi->direct_mapping, 0);
#include <linux/highmem.h>
#include <linux/prefetch.h>
#include <linux/mpage.h>
+#include <linux/writeback.h>
#include <linux/pagevec.h>
/*
sector_t last_block_in_bio = 0;
int ret = 0;
int done = 0;
+ int sync = called_for_sync();
struct pagevec pvec;
int (*writepage)(struct page *);
struct page *page = list_entry(mapping->io_pages.prev,
struct page, list);
list_del(&page->list);
- if (PageWriteback(page)) {
+ if (PageWriteback(page) && !sync) {
if (PageDirty(page)) {
list_add(&page->list, &mapping->dirty_pages);
continue;
lock_page(page);
+ if (sync)
+ wait_on_page_writeback(page);
+
if (page->mapping && !PageWriteback(page) &&
TestClearPageDirty(page)) {
if (writepage) {
pagevec_deactivate_inactive(&pvec);
page = NULL;
}
+ if (ret == -EAGAIN && page) {
+ __set_page_dirty_nobuffers(page);
+ ret = 0;
+ }
if (ret || (nr_to_write && --(*nr_to_write) <= 0))
done = 1;
} else {
* Flush all pending writes before doing anything
* with locks..
*/
- status = filemap_fdatawait(inode->i_mapping);
- status2 = filemap_fdatawrite(inode->i_mapping);
- if (!status)
- status = status2;
+ status = filemap_fdatawrite(inode->i_mapping);
down(&inode->i_sem);
status2 = nfs_wb_all(inode);
if (!status)
*/
out_ok:
if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
- filemap_fdatawait(inode->i_mapping);
filemap_fdatawrite(inode->i_mapping);
down(&inode->i_sem);
nfs_wb_all(inode); /* we may have slept */
if (!S_ISREG(inode->i_mode))
attr->ia_valid &= ~ATTR_SIZE;
- filemap_fdatawait(inode->i_mapping);
filemap_fdatawrite(inode->i_mapping);
error = nfs_wb_all(inode);
filemap_fdatawait(inode->i_mapping);
struct inode *inode = dp->d_inode;
int (*fsync) (struct file *, struct dentry *, int);
- filemap_fdatawait(inode->i_mapping);
filemap_fdatawrite(inode->i_mapping);
if (fop && (fsync = fop->fsync))
fsync(filp, dp, 0);
/* We must flush any dirty pages now as we won't be able to
write anything after close. mmap can trigger this.
"openers" should perhaps include mmap'ers ... */
- filemap_fdatawait(inode->i_mapping);
filemap_fdatawrite(inode->i_mapping);
filemap_fdatawait(inode->i_mapping);
smb_close(inode);
DENTRY_PATH(dentry),
(long) inode->i_size, (long) attr->ia_size);
- filemap_fdatawait(inode->i_mapping);
filemap_fdatawrite(inode->i_mapping);
filemap_fdatawait(inode->i_mapping);
mark_buffer_dirty_inode(bh, inode);
udf_release_data(bh);
- inode->i_data.a_ops->writepage(page);
+ if (inode->i_data.a_ops->writepage(page) == -EAGAIN)
+ __set_page_dirty_nobuffers(page);
page_cache_release(page);
mark_inode_dirty(inode);
#define PF_FREEZE 0x00010000 /* this task should be frozen for suspend */
#define PF_IOTHREAD 0x00020000 /* this thread is needed for doing I/O to swap */
#define PF_FROZEN 0x00040000 /* frozen for system suspend */
+#define PF_SYNC 0x00080000 /* performing fsync(), etc */
/*
* Ptrace flags
read-only. */
+/*
+ * Tell the writeback paths that they are being called for a "data integrity"
+ * operation such as fsync().
+ */
+static inline int called_for_sync(void)
+{
+ return current->flags & PF_SYNC;
+}
+
#endif /* WRITEBACK_H */
SetPageReferenced(page);
}
- /* Set the page dirty again, unlock */
- set_page_dirty(page);
unlock_page(page);
- return 0;
+ return -EAGAIN; /* It will be set dirty again */
}
EXPORT_SYMBOL(fail_writepage);
/**
- * filemap_fdatawrite - walk the list of dirty pages of the given address space
- * and writepage() all of them.
+ * filemap_fdatawrite - start writeback against all of a mapping's dirty pages
+ * @mapping: address space structure to write
*
- * @mapping: address space structure to write
+ * This is a "data integrity" operation, as opposed to a regular memory
+ * cleansing writeback. The difference between these two operations is that
+ * if a dirty page/buffer is encountered, it must be waited upon, and not just
+ * skipped over.
*
+ * The PF_SYNC flag is set across this operation and the various functions
+ * which care about this distinction must use called_for_sync() to find out
+ * which behaviour they should implement.
*/
int filemap_fdatawrite(struct address_space *mapping)
{
- return do_writepages(mapping, NULL);
+ int ret;
+
+ current->flags |= PF_SYNC;
+ ret = do_writepages(mapping, NULL);
+ current->flags &= ~PF_SYNC;
+ return ret;
}
/**
- * filemap_fdatawait - walk the list of locked pages of the given address space
- * and wait for all of them.
- *
- * @mapping: address space structure to wait for
- *
+ * filemap_fdatawait - walk the list of locked pages of the given address
+ * space and wait for all of them.
+ * @mapping: address space structure to wait for
*/
int filemap_fdatawait(struct address_space * mapping)
{
write_lock(&mapping->page_lock);
while (!list_empty(&mapping->locked_pages)) {
- struct page *page = list_entry(mapping->locked_pages.next, struct page, list);
+ struct page *page;
+ page = list_entry(mapping->locked_pages.next,struct page,list);
list_del(&page->list);
if (PageDirty(page))
list_add(&page->list, &mapping->dirty_pages);
int err;
down(&inode->i_sem);
- ret = filemap_fdatawait(inode->i_mapping);
- err = filemap_fdatawrite(inode->i_mapping);
- if (!ret)
- ret = err;
+ ret = filemap_fdatawrite(inode->i_mapping);
if (flags & MS_SYNC) {
if (file->f_op && file->f_op->fsync) {
err = file->f_op->fsync(file, file->f_dentry, 1);
#if 0
if (!PageWriteback(page) && PageDirty(page)) {
lock_page(page);
- if (!PageWriteback(page) && TestClearPageDirty(page))
- page->mapping->a_ops->writepage(page);
- else
+ if (!PageWriteback(page) && TestClearPageDirty(page)) {
+ int ret;
+
+ ret = page->mapping->a_ops->writepage(page);
+ if (ret == -EAGAIN)
+ __set_page_dirty_nobuffers(page);
+ } else {
unlock_page(page);
+ }
}
#endif
}
page_cache_get(page);
write_unlock(&mapping->page_lock);
ret = mapping->a_ops->writepage(page);
+ if (ret == -EAGAIN) {
+ __set_page_dirty_nobuffers(page);
+ ret = 0;
+ }
if (ret == 0 && wait) {
wait_on_page_writeback(page);
if (PageError(page))