git.neil.brown.name Git - history.git/commitdiff
[PATCH] no-buffer-head ext2 option
authorAndrew Morton <akpm@digeo.com>
Fri, 22 Nov 2002 03:32:45 +0000 (19:32 -0800)
committerLinus Torvalds <torvalds@penguin.transmeta.com>
Fri, 22 Nov 2002 03:32:45 +0000 (19:32 -0800)
Implements a new set of block address_space_operations which will never
attach buffer_heads to file pagecache.  These can be turned on for ext2
with the `nobh' mount option.

During write-intensive testing on a 7G machine, total buffer_head
storage remained below 0.3 megabytes.  And those buffer_heads are
against ZONE_NORMAL pagecache and will be reclaimed by ZONE_NORMAL
memory pressure.

This work is, of course, a special case for the huge highmem machines.
Possibly it obsoletes the buffer_heads_over_limit stuff (which doesn't
work terribly well), but that code is simple, and will provide relief
for other filesystems.

It should be noted that the nobh_prepare_write() function and the
PageMappedToDisk() infrastructure are what is needed to solve the
problem of user data corruption when the filesystem which backs a
sparse MAP_SHARED mapping runs out of space.  We can use this code in
filemap_nopage() to ensure that all mapped pages have space allocated
on-disk.  Deliver SIGBUS on ENOSPC.

This will require a new address_space op, I expect.

fs/buffer.c
fs/ext2/ext2.h
fs/ext2/inode.c
fs/ext2/namei.c
fs/ext2/super.c
fs/mpage.c
include/linux/buffer_head.h
include/linux/ext2_fs.h
include/linux/page-flags.h
mm/page_alloc.c
mm/truncate.c

index a0c7f850482ff0f4274bcf689cfa6df10a97f550..aacd45b8b2606dd28cf210498476b5034780116e 100644 (file)
@@ -1964,6 +1964,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
        struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
        unsigned int blocksize, blocks;
        int nr, i;
+       int fully_mapped = 1;
 
        if (!PageLocked(page))
                PAGE_BUG(page);
@@ -1986,6 +1987,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
                        continue;
 
                if (!buffer_mapped(bh)) {
+                       fully_mapped = 0;
                        if (iblock < lblock) {
                                if (get_block(inode, iblock, bh, 0))
                                        SetPageError(page);
@@ -2008,6 +2010,9 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
                arr[nr++] = bh;
        } while (i++, iblock++, (bh = bh->b_this_page) != head);
 
+       if (fully_mapped)
+               SetPageMappedToDisk(page);
+
        if (!nr) {
                /*
                 * All buffers are uptodate - we can set the page uptodate
@@ -2204,6 +2209,198 @@ int generic_commit_write(struct file *file, struct page *page,
        return 0;
 }
 
+/*
+ * On entry, the page is fully not uptodate.
+ * On exit the page is fully uptodate in the areas outside (from,to)
+ */
+int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
+                       get_block_t *get_block)
+{
+       struct inode *inode = page->mapping->host;
+       const unsigned blkbits = inode->i_blkbits;
+       const unsigned blocksize = 1 << blkbits;
+       struct buffer_head map_bh;
+       struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
+       unsigned block_in_page;
+       unsigned block_start;
+       sector_t block_in_file;
+       char *kaddr;
+       int nr_reads = 0;
+       int i;
+       int ret = 0;
+       int is_mapped_to_disk = 1;
+       int dirtied_it = 0;
+
+       if (PageMappedToDisk(page))
+               return 0;
+
+       block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
+       map_bh.b_page = page;
+
+       /*
+        * We loop across all blocks in the page, whether or not they are
+        * part of the affected region.  This is so we can discover if the
+        * page is fully mapped-to-disk.
+        */
+       for (block_start = 0, block_in_page = 0;
+                 block_start < PAGE_CACHE_SIZE;
+                 block_in_page++, block_start += blocksize) {
+               unsigned block_end = block_start + blocksize;
+               int create;
+
+               map_bh.b_state = 0;
+               create = 1;
+               if (block_start >= to)
+                       create = 0;
+               ret = get_block(inode, block_in_file + block_in_page,
+                                       &map_bh, create);
+               if (ret)
+                       goto failed;
+               if (!buffer_mapped(&map_bh))
+                       is_mapped_to_disk = 0;
+               if (buffer_new(&map_bh))
+                       unmap_underlying_metadata(map_bh.b_bdev,
+                                                       map_bh.b_blocknr);
+               if (PageUptodate(page))
+                       continue;
+               if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
+                       kaddr = kmap_atomic(page, KM_USER0);
+                       if (block_start < from) {
+                               memset(kaddr+block_start, 0, from-block_start);
+                               dirtied_it = 1;
+                       }
+                       if (block_end > to) {
+                               memset(kaddr + to, 0, block_end - to);
+                               dirtied_it = 1;
+                       }
+                       flush_dcache_page(page);
+                       kunmap_atomic(kaddr, KM_USER0);
+                       continue;
+               }
+               if (buffer_uptodate(&map_bh))
+                       continue;       /* reiserfs does this */
+               if (block_start < from || block_end > to) {
+                       struct buffer_head *bh = alloc_buffer_head();
+
+                       if (!bh) {
+                               ret = -ENOMEM;
+                               goto failed;
+                       }
+                       bh->b_state = map_bh.b_state;
+                       atomic_set(&bh->b_count, 0);
+                       bh->b_this_page = 0;
+                       bh->b_page = page;
+                       bh->b_blocknr = map_bh.b_blocknr;
+                       bh->b_size = blocksize;
+                       bh->b_data = (char *)block_start;
+                       bh->b_bdev = map_bh.b_bdev;
+                       bh->b_private = NULL;
+                       read_bh[nr_reads++] = bh;
+               }
+       }
+
+       if (nr_reads) {
+               ll_rw_block(READ, nr_reads, read_bh);
+               for (i = 0; i < nr_reads; i++) {
+                       wait_on_buffer(read_bh[i]);
+                       if (!buffer_uptodate(read_bh[i]))
+                               ret = -EIO;
+                       free_buffer_head(read_bh[i]);
+                       read_bh[i] = NULL;
+               }
+               if (ret)
+                       goto failed;
+       }
+
+       if (is_mapped_to_disk)
+               SetPageMappedToDisk(page);
+       SetPageUptodate(page);
+
+       /*
+        * Setting the page dirty here isn't necessary for the prepare_write
+        * function - commit_write will do that.  But if/when this function is
+        * used within the pagefault handler to ensure that all mmapped pages
+        * have backing space in the filesystem, we will need to dirty the page
+        * if its contents were altered.
+        */
+       if (dirtied_it)
+               set_page_dirty(page);
+
+       return 0;
+
+failed:
+       for (i = 0; i < nr_reads; i++) {
+               if (read_bh[i])
+                       free_buffer_head(read_bh[i]);
+       }
+
+       /*
+        * Error recovery is pretty slack.  Clear the page and mark it dirty
+        * so we'll later zero out any blocks which _were_ allocated.
+        */
+       kaddr = kmap_atomic(page, KM_USER0);
+       memset(kaddr, 0, PAGE_CACHE_SIZE);
+       kunmap_atomic(kaddr, KM_USER0);
+       SetPageUptodate(page);
+       set_page_dirty(page);
+       return ret;
+}
+EXPORT_SYMBOL(nobh_prepare_write);
+
+int nobh_commit_write(struct file *file, struct page *page,
+               unsigned from, unsigned to)
+{
+       struct inode *inode = page->mapping->host;
+       loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+
+       set_page_dirty(page);
+       if (pos > inode->i_size) {
+               inode->i_size = pos;
+               mark_inode_dirty(inode);
+       }
+       return 0;
+}
+EXPORT_SYMBOL(nobh_commit_write);
+
+/*
+ * This function assumes that ->prepare_write() uses nobh_prepare_write().
+ */
+int nobh_truncate_page(struct address_space *mapping, loff_t from)
+{
+       struct inode *inode = mapping->host;
+       unsigned blocksize = 1 << inode->i_blkbits;
+       pgoff_t index = from >> PAGE_CACHE_SHIFT;
+       unsigned offset = from & (PAGE_CACHE_SIZE-1);
+       unsigned to;
+       struct page *page;
+       struct address_space_operations *a_ops = mapping->a_ops;
+       char *kaddr;
+       int ret = 0;
+
+       if ((offset & (blocksize - 1)) == 0)
+               goto out;
+
+       ret = -ENOMEM;
+       page = grab_cache_page(mapping, index);
+       if (!page)
+               goto out;
+
+       to = (offset + blocksize) & ~(blocksize - 1);
+       ret = a_ops->prepare_write(NULL, page, offset, to);
+       if (ret == 0) {
+               kaddr = kmap_atomic(page, KM_USER0);
+               memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+               flush_dcache_page(page);
+               kunmap_atomic(kaddr, KM_USER0);
+               set_page_dirty(page);
+       }
+       unlock_page(page);
+       page_cache_release(page);
+out:
+       return ret;
+}
+EXPORT_SYMBOL(nobh_truncate_page);
+
 int block_truncate_page(struct address_space *mapping,
                        loff_t from, get_block_t *get_block)
 {
index e5de47ed3407d2ced445c1fc89600cd868c4ceb2..59b5921837cdedb76b81a36204866aa11b4c3117 100644 (file)
@@ -120,6 +120,7 @@ extern struct file_operations ext2_file_operations;
 
 /* inode.c */
 extern struct address_space_operations ext2_aops;
+extern struct address_space_operations ext2_nobh_aops;
 
 /* namei.c */
 extern struct inode_operations ext2_dir_inode_operations;
index 7fe83a0443869569f639e25cb216bb7af5b83cfc..29cccde53b74ef4a8bc033bc466f7717d121f242 100644 (file)
@@ -612,6 +612,13 @@ ext2_prepare_write(struct file *file, struct page *page,
        return block_prepare_write(page,from,to,ext2_get_block);
 }
 
+static int
+ext2_nobh_prepare_write(struct file *file, struct page *page,
+                       unsigned from, unsigned to)
+{
+       return nobh_prepare_write(page,from,to,ext2_get_block);
+}
+
 static sector_t ext2_bmap(struct address_space *mapping, sector_t block)
 {
        return generic_block_bmap(mapping,block,ext2_get_block);
@@ -657,6 +664,18 @@ struct address_space_operations ext2_aops = {
        .writepages             = ext2_writepages,
 };
 
+struct address_space_operations ext2_nobh_aops = {
+       .readpage               = ext2_readpage,
+       .readpages              = ext2_readpages,
+       .writepage              = ext2_writepage,
+       .sync_page              = block_sync_page,
+       .prepare_write          = ext2_nobh_prepare_write,
+       .commit_write           = nobh_commit_write,
+       .bmap                   = ext2_bmap,
+       .direct_IO              = ext2_direct_IO,
+       .writepages             = ext2_writepages,
+};
+
 /*
  * Probably it should be a library function... search for first non-zero word
  * or memcmp with zero_page, whatever is better for particular architecture.
@@ -864,7 +883,11 @@ void ext2_truncate (struct inode * inode)
        iblock = (inode->i_size + blocksize-1)
                                        >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
 
-       block_truncate_page(inode->i_mapping, inode->i_size, ext2_get_block);
+       if (test_opt(inode->i_sb, NOBH))
+               nobh_truncate_page(inode->i_mapping, inode->i_size);
+       else
+               block_truncate_page(inode->i_mapping,
+                               inode->i_size, ext2_get_block);
 
        n = ext2_block_to_path(inode, iblock, offsets, NULL);
        if (n == 0)
@@ -1044,17 +1067,26 @@ void ext2_read_inode (struct inode * inode)
        if (S_ISREG(inode->i_mode)) {
                inode->i_op = &ext2_file_inode_operations;
                inode->i_fop = &ext2_file_operations;
-               inode->i_mapping->a_ops = &ext2_aops;
+               if (test_opt(inode->i_sb, NOBH))
+                       inode->i_mapping->a_ops = &ext2_nobh_aops;
+               else
+                       inode->i_mapping->a_ops = &ext2_aops;
        } else if (S_ISDIR(inode->i_mode)) {
                inode->i_op = &ext2_dir_inode_operations;
                inode->i_fop = &ext2_dir_operations;
-               inode->i_mapping->a_ops = &ext2_aops;
+               if (test_opt(inode->i_sb, NOBH))
+                       inode->i_mapping->a_ops = &ext2_nobh_aops;
+               else
+                       inode->i_mapping->a_ops = &ext2_aops;
        } else if (S_ISLNK(inode->i_mode)) {
                if (ext2_inode_is_fast_symlink(inode))
                        inode->i_op = &ext2_fast_symlink_inode_operations;
                else {
                        inode->i_op = &ext2_symlink_inode_operations;
-                       inode->i_mapping->a_ops = &ext2_aops;
+                       if (test_opt(inode->i_sb, NOBH))
+                               inode->i_mapping->a_ops = &ext2_nobh_aops;
+                       else
+                               inode->i_mapping->a_ops = &ext2_aops;
                }
        } else {
                inode->i_op = &ext2_special_inode_operations;
index 9f0788e3f6ef7e06e5b0f78544500a7dac2a14fb..04489df5a2e5a8a7dc5d0362930e69759fda33ff 100644 (file)
@@ -127,7 +127,10 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, int mode)
        if (!IS_ERR(inode)) {
                inode->i_op = &ext2_file_inode_operations;
                inode->i_fop = &ext2_file_operations;
-               inode->i_mapping->a_ops = &ext2_aops;
+               if (test_opt(inode->i_sb, NOBH))
+                       inode->i_mapping->a_ops = &ext2_nobh_aops;
+               else
+                       inode->i_mapping->a_ops = &ext2_aops;
                mark_inode_dirty(inode);
                err = ext2_add_nondir(dentry, inode);
        }
@@ -168,7 +171,10 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry,
        if (l > sizeof (EXT2_I(inode)->i_data)) {
                /* slow symlink */
                inode->i_op = &ext2_symlink_inode_operations;
-               inode->i_mapping->a_ops = &ext2_aops;
+               if (test_opt(inode->i_sb, NOBH))
+                       inode->i_mapping->a_ops = &ext2_nobh_aops;
+               else
+                       inode->i_mapping->a_ops = &ext2_aops;
                err = page_symlink(inode, symname, l);
                if (err)
                        goto out_fail;
@@ -222,7 +228,10 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
 
        inode->i_op = &ext2_dir_inode_operations;
        inode->i_fop = &ext2_dir_operations;
-       inode->i_mapping->a_ops = &ext2_aops;
+       if (test_opt(inode->i_sb, NOBH))
+               inode->i_mapping->a_ops = &ext2_nobh_aops;
+       else
+               inode->i_mapping->a_ops = &ext2_aops;
 
        ext2_inc_count(inode);
 
index a0acf56bd78101b314d90906d833cb3b9cc7c284..23116c16e3ce9ddfda2001809f6914bcbb95f3d1 100644 (file)
@@ -391,6 +391,8 @@ static int parse_options (char * options,
                        set_opt (sbi->s_mount_opt, OLDALLOC);
                else if (!strcmp (this_char, "orlov"))
                        clear_opt (sbi->s_mount_opt, OLDALLOC);
+               else if (!strcmp (this_char, "nobh"))
+                       set_opt(sbi->s_mount_opt, NOBH);
                /* Silently ignore the quota options */
                else if (!strcmp (this_char, "grpquota")
                         || !strcmp (this_char, "noquota")
index 05622a8e515a8aed5f4b53b68035bde9827da6c8..8307f43f18b69691c51b89fd4fb0f902738dad14 100644 (file)
@@ -178,6 +178,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
        struct block_device *bdev = NULL;
        struct buffer_head bh;
        int length;
+       int fully_mapped = 1;
 
        if (page_has_buffers(page))
                goto confused;
@@ -194,6 +195,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
                }
 
                if (!buffer_mapped(&bh)) {
+                       fully_mapped = 0;
                        if (first_hole == blocks_per_page)
                                first_hole = page_block;
                        continue;
@@ -220,6 +222,8 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
                        unlock_page(page);
                        goto out;
                }
+       } else if (fully_mapped) {
+               SetPageMappedToDisk(page);
        }
 
        /*
index 8587dd6f714648b69933629b5668c58b2585447a..4e7a9bbf99ddf6cf9facd28ec41f4fca13769a7d 100644 (file)
@@ -190,6 +190,9 @@ sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
 int generic_commit_write(struct file *, struct page *, unsigned, unsigned);
 int block_truncate_page(struct address_space *, loff_t, get_block_t *);
 int file_fsync(struct file *, struct dentry *, int);
+int nobh_prepare_write(struct page*, unsigned, unsigned, get_block_t*);
+int nobh_commit_write(struct file *, struct page *, unsigned, unsigned);
+int nobh_truncate_page(struct address_space *, loff_t);
 
 #define OSYNC_METADATA (1<<0)
 #define OSYNC_DATA     (1<<1)
index 0d008cfb99a855ee488b44b637d132acd001a93f..d701ba88c68821728c302090cee6af85ea6679a7 100644 (file)
@@ -308,6 +308,7 @@ struct ext2_inode {
 #define EXT2_MOUNT_ERRORS_RO           0x0020  /* Remount fs ro on errors */
 #define EXT2_MOUNT_ERRORS_PANIC                0x0040  /* Panic on errors */
 #define EXT2_MOUNT_MINIX_DF            0x0080  /* Mimics the Minix statfs */
+#define EXT2_MOUNT_NOBH                        0x0100  /* No buffer_heads */
 #define EXT2_MOUNT_NO_UID32            0x0200  /* Disable 32-bit UIDs */
 #define EXT2_MOUNT_XATTR_USER          0x4000  /* Extended user attributes */
 #define EXT2_MOUNT_POSIX_ACL           0x8000  /* POSIX Access Control Lists */
index 44480d80952fab344e0b271c97d27b811571f9d7..f02449871c077d7e3cbafc384dc1a46f76d9a5ef 100644 (file)
@@ -70,6 +70,7 @@
 #define PG_chainlock           15      /* lock bit for ->pte_chain */
 
 #define PG_direct              16      /* ->pte_chain points directly at pte */
+#define PG_mappedtodisk                17      /* Has blocks allocated on-disk */
 
 /*
  * Global page accounting.  One instance per CPU.  Only unsigned longs are
@@ -233,6 +234,10 @@ extern void get_full_page_state(struct page_state *ret);
 #define ClearPageDirect(page)          clear_bit(PG_direct, &(page)->flags)
 #define TestClearPageDirect(page)      test_and_clear_bit(PG_direct, &(page)->flags)
 
+#define PageMappedToDisk(page) test_bit(PG_mappedtodisk, &(page)->flags)
+#define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags)
+#define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags)
+
 /*
  * The PageSwapCache predicate doesn't use a PG_flag at this time,
  * but it may again do so one day.
index d8921e02318ae098620896912d3897ded226f556..a810142b49c0dc2b7a4befe99b237e46579fc75d 100644 (file)
@@ -265,7 +265,7 @@ static void prep_new_page(struct page *page, int order)
 
        page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
                        1 << PG_referenced | 1 << PG_arch_1 |
-                       1 << PG_checked);
+                       1 << PG_checked | 1 << PG_mappedtodisk);
        set_page_refs(page, order);
 }
 
index 884b4e3930c2869710e7e40af3ff051ccdf6cf41..25843367c31ee38e958756dbe1760d6db40d5b01 100644 (file)
@@ -52,6 +52,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
 
        clear_page_dirty(page);
        ClearPageUptodate(page);
+       ClearPageMappedToDisk(page);
        remove_from_page_cache(page);
        page_cache_release(page);       /* pagecache ref */
 }