From b1ad1f4efce23ad0801492c0d5ffa8c0aa6a8cdb Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 21 Nov 2002 19:32:45 -0800 Subject: [PATCH] no-buffer-head ext2 option Implements a new set of block address_space_operations which will never attach buffer_heads to file pagecache. These can be turned on for ext2 with the `nobh' mount option. During write-intensive testing on a 7G machine, total buffer_head storage remained below 0.3 megabytes. And those buffer_heads are against ZONE_NORMAL pagecache and will be reclaimed by ZONE_NORMAL memory pressure. This work is, of course, a special case for the huge highmem machines. Possibly it obsoletes the buffer_heads_over_limit stuff (which doesn't work terribly well), but that code is simple, and will provide relief for other filesystems. It should be noted that the nobh_prepare_write() function and the PageMappedToDisk() infrastructure is what is needed to solve the problem of user data corruption when the filesystem which backs a sparse MAP_SHARED mapping runs out of space. We can use this code in filemap_nopage() to ensure that all mapped pages have space allocated on-disk. Deliver SIGBUS on ENOSPC. This will require a new address_space op, I expect. 
--- fs/buffer.c | 197 ++++++++++++++++++++++++++++++++++++ fs/ext2/ext2.h | 1 + fs/ext2/inode.c | 40 +++++++- fs/ext2/namei.c | 15 ++- fs/ext2/super.c | 2 + fs/mpage.c | 4 + include/linux/buffer_head.h | 3 + include/linux/ext2_fs.h | 1 + include/linux/page-flags.h | 5 + mm/page_alloc.c | 2 +- mm/truncate.c | 1 + 11 files changed, 263 insertions(+), 8 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index a0c7f850482f..aacd45b8b260 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1964,6 +1964,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block) struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; unsigned int blocksize, blocks; int nr, i; + int fully_mapped = 1; if (!PageLocked(page)) PAGE_BUG(page); @@ -1986,6 +1987,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block) continue; if (!buffer_mapped(bh)) { + fully_mapped = 0; if (iblock < lblock) { if (get_block(inode, iblock, bh, 0)) SetPageError(page); @@ -2008,6 +2010,9 @@ int block_read_full_page(struct page *page, get_block_t *get_block) arr[nr++] = bh; } while (i++, iblock++, (bh = bh->b_this_page) != head); + if (fully_mapped) + SetPageMappedToDisk(page); + if (!nr) { /* * All buffers are uptodate - we can set the page uptodate @@ -2204,6 +2209,198 @@ int generic_commit_write(struct file *file, struct page *page, return 0; } +/* + * On entry, the page is fully not uptodate. 
+ * On exit the page is fully uptodate in the areas outside (from,to) + */ +int nobh_prepare_write(struct page *page, unsigned from, unsigned to, + get_block_t *get_block) +{ + struct inode *inode = page->mapping->host; + const unsigned blkbits = inode->i_blkbits; + const unsigned blocksize = 1 << blkbits; + struct buffer_head map_bh; + struct buffer_head *read_bh[MAX_BUF_PER_PAGE]; + unsigned block_in_page; + unsigned block_start; + sector_t block_in_file; + char *kaddr; + int nr_reads = 0; + int i; + int ret = 0; + int is_mapped_to_disk = 1; + int dirtied_it = 0; + + if (PageMappedToDisk(page)) + return 0; + + block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); + map_bh.b_page = page; + + /* + * We loop across all blocks in the page, whether or not they are + * part of the affected region. This is so we can discover if the + * page is fully mapped-to-disk. + */ + for (block_start = 0, block_in_page = 0; + block_start < PAGE_CACHE_SIZE; + block_in_page++, block_start += blocksize) { + unsigned block_end = block_start + blocksize; + int create; + + map_bh.b_state = 0; + create = 1; + if (block_start >= to) + create = 0; + ret = get_block(inode, block_in_file + block_in_page, + &map_bh, create); + if (ret) + goto failed; + if (!buffer_mapped(&map_bh)) + is_mapped_to_disk = 0; + if (buffer_new(&map_bh)) + unmap_underlying_metadata(map_bh.b_bdev, + map_bh.b_blocknr); + if (PageUptodate(page)) + continue; + if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) { + kaddr = kmap_atomic(page, KM_USER0); + if (block_start < from) { + memset(kaddr+block_start, 0, from-block_start); + dirtied_it = 1; + } + if (block_end > to) { + memset(kaddr + to, 0, block_end - to); + dirtied_it = 1; + } + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + continue; + } + if (buffer_uptodate(&map_bh)) + continue; /* reiserfs does this */ + if (block_start < from || block_end > to) { + struct buffer_head *bh = alloc_buffer_head(); + + if (!bh) { + ret = -ENOMEM; + 
goto failed; + } + bh->b_state = map_bh.b_state; + atomic_set(&bh->b_count, 0); + bh->b_this_page = 0; + bh->b_page = page; + bh->b_blocknr = map_bh.b_blocknr; + bh->b_size = blocksize; + bh->b_data = (char *)block_start; + bh->b_bdev = map_bh.b_bdev; + bh->b_private = NULL; + read_bh[nr_reads++] = bh; + } + } + + if (nr_reads) { + ll_rw_block(READ, nr_reads, read_bh); + for (i = 0; i < nr_reads; i++) { + wait_on_buffer(read_bh[i]); + if (!buffer_uptodate(read_bh[i])) + ret = -EIO; + free_buffer_head(read_bh[i]); + read_bh[i] = NULL; + } + if (ret) + goto failed; + } + + if (is_mapped_to_disk) + SetPageMappedToDisk(page); + SetPageUptodate(page); + + /* + * Setting the page dirty here isn't necessary for the prepare_write + * function - commit_write will do that. But if/when this function is + * used within the pagefault handler to ensure that all mmapped pages + * have backing space in the filesystem, we will need to dirty the page + * if its contents were altered. + */ + if (dirtied_it) + set_page_dirty(page); + + return 0; + +failed: + for (i = 0; i < nr_reads; i++) { + if (read_bh[i]) + free_buffer_head(read_bh[i]); + } + + /* + * Error recovery is pretty slack. Clear the page and mark it dirty + * so we'll later zero out any blocks which _were_ allocated. + */ + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr, 0, PAGE_CACHE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + SetPageUptodate(page); + set_page_dirty(page); + return ret; +} +EXPORT_SYMBOL(nobh_prepare_write); + +int nobh_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + + set_page_dirty(page); + if (pos > inode->i_size) { + inode->i_size = pos; + mark_inode_dirty(inode); + } + return 0; +} +EXPORT_SYMBOL(nobh_commit_write); + +/* + * This function assumes that ->prepare_write() uses nobh_prepare_write(). 
+ */ +int nobh_truncate_page(struct address_space *mapping, loff_t from) +{ + struct inode *inode = mapping->host; + unsigned blocksize = 1 << inode->i_blkbits; + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned to; + struct page *page; + struct address_space_operations *a_ops = mapping->a_ops; + char *kaddr; + int ret = 0; + + if ((offset & (blocksize - 1)) == 0) + goto out; + + ret = -ENOMEM; + page = grab_cache_page(mapping, index); + if (!page) + goto out; + + to = (offset + blocksize) & ~(blocksize - 1); + ret = a_ops->prepare_write(NULL, page, offset, to); + if (ret == 0) { + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + set_page_dirty(page); + } + unlock_page(page); + page_cache_release(page); +out: + return ret; +} +EXPORT_SYMBOL(nobh_truncate_page); + int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block) { diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index e5de47ed3407..59b5921837cd 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -120,6 +120,7 @@ extern struct file_operations ext2_file_operations; /* inode.c */ extern struct address_space_operations ext2_aops; +extern struct address_space_operations ext2_nobh_aops; /* namei.c */ extern struct inode_operations ext2_dir_inode_operations; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 7fe83a044386..29cccde53b74 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -612,6 +612,13 @@ ext2_prepare_write(struct file *file, struct page *page, return block_prepare_write(page,from,to,ext2_get_block); } +static int +ext2_nobh_prepare_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + return nobh_prepare_write(page,from,to,ext2_get_block); +} + static sector_t ext2_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping,block,ext2_get_block); @@ -657,6 
+664,18 @@ struct address_space_operations ext2_aops = { .writepages = ext2_writepages, }; +struct address_space_operations ext2_nobh_aops = { + .readpage = ext2_readpage, + .readpages = ext2_readpages, + .writepage = ext2_writepage, + .sync_page = block_sync_page, + .prepare_write = ext2_nobh_prepare_write, + .commit_write = nobh_commit_write, + .bmap = ext2_bmap, + .direct_IO = ext2_direct_IO, + .writepages = ext2_writepages, +}; + /* * Probably it should be a library function... search for first non-zero word * or memcmp with zero_page, whatever is better for particular architecture. @@ -864,7 +883,11 @@ void ext2_truncate (struct inode * inode) iblock = (inode->i_size + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb); - block_truncate_page(inode->i_mapping, inode->i_size, ext2_get_block); + if (test_opt(inode->i_sb, NOBH)) + nobh_truncate_page(inode->i_mapping, inode->i_size); + else + block_truncate_page(inode->i_mapping, + inode->i_size, ext2_get_block); n = ext2_block_to_path(inode, iblock, offsets, NULL); if (n == 0) @@ -1044,17 +1067,26 @@ void ext2_read_inode (struct inode * inode) if (S_ISREG(inode->i_mode)) { inode->i_op = &ext2_file_inode_operations; inode->i_fop = &ext2_file_operations; - inode->i_mapping->a_ops = &ext2_aops; + if (test_opt(inode->i_sb, NOBH)) + inode->i_mapping->a_ops = &ext2_nobh_aops; + else + inode->i_mapping->a_ops = &ext2_aops; } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext2_dir_inode_operations; inode->i_fop = &ext2_dir_operations; - inode->i_mapping->a_ops = &ext2_aops; + if (test_opt(inode->i_sb, NOBH)) + inode->i_mapping->a_ops = &ext2_nobh_aops; + else + inode->i_mapping->a_ops = &ext2_aops; } else if (S_ISLNK(inode->i_mode)) { if (ext2_inode_is_fast_symlink(inode)) inode->i_op = &ext2_fast_symlink_inode_operations; else { inode->i_op = &ext2_symlink_inode_operations; - inode->i_mapping->a_ops = &ext2_aops; + if (test_opt(inode->i_sb, NOBH)) + inode->i_mapping->a_ops = &ext2_nobh_aops; + else + 
inode->i_mapping->a_ops = &ext2_aops; } } else { inode->i_op = &ext2_special_inode_operations; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 9f0788e3f6ef..04489df5a2e5 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -127,7 +127,10 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, int mode) if (!IS_ERR(inode)) { inode->i_op = &ext2_file_inode_operations; inode->i_fop = &ext2_file_operations; - inode->i_mapping->a_ops = &ext2_aops; + if (test_opt(inode->i_sb, NOBH)) + inode->i_mapping->a_ops = &ext2_nobh_aops; + else + inode->i_mapping->a_ops = &ext2_aops; mark_inode_dirty(inode); err = ext2_add_nondir(dentry, inode); } @@ -168,7 +171,10 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry, if (l > sizeof (EXT2_I(inode)->i_data)) { /* slow symlink */ inode->i_op = &ext2_symlink_inode_operations; - inode->i_mapping->a_ops = &ext2_aops; + if (test_opt(inode->i_sb, NOBH)) + inode->i_mapping->a_ops = &ext2_nobh_aops; + else + inode->i_mapping->a_ops = &ext2_aops; err = page_symlink(inode, symname, l); if (err) goto out_fail; @@ -222,7 +228,10 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode) inode->i_op = &ext2_dir_inode_operations; inode->i_fop = &ext2_dir_operations; - inode->i_mapping->a_ops = &ext2_aops; + if (test_opt(inode->i_sb, NOBH)) + inode->i_mapping->a_ops = &ext2_nobh_aops; + else + inode->i_mapping->a_ops = &ext2_aops; ext2_inc_count(inode); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index a0acf56bd781..23116c16e3ce 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -391,6 +391,8 @@ static int parse_options (char * options, set_opt (sbi->s_mount_opt, OLDALLOC); else if (!strcmp (this_char, "orlov")) clear_opt (sbi->s_mount_opt, OLDALLOC); + else if (!strcmp (this_char, "nobh")) + set_opt(sbi->s_mount_opt, NOBH); /* Silently ignore the quota options */ else if (!strcmp (this_char, "grpquota") || !strcmp (this_char, "noquota") diff --git a/fs/mpage.c b/fs/mpage.c index 
05622a8e515a..8307f43f18b6 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -178,6 +178,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, struct block_device *bdev = NULL; struct buffer_head bh; int length; + int fully_mapped = 1; if (page_has_buffers(page)) goto confused; @@ -194,6 +195,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, } if (!buffer_mapped(&bh)) { + fully_mapped = 0; if (first_hole == blocks_per_page) first_hole = page_block; continue; @@ -220,6 +222,8 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, unlock_page(page); goto out; } + } else if (fully_mapped) { + SetPageMappedToDisk(page); } /* diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 8587dd6f7146..4e7a9bbf99dd 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -190,6 +190,9 @@ sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); int generic_commit_write(struct file *, struct page *, unsigned, unsigned); int block_truncate_page(struct address_space *, loff_t, get_block_t *); int file_fsync(struct file *, struct dentry *, int); +int nobh_prepare_write(struct page*, unsigned, unsigned, get_block_t*); +int nobh_commit_write(struct file *, struct page *, unsigned, unsigned); +int nobh_truncate_page(struct address_space *, loff_t); #define OSYNC_METADATA (1<<0) #define OSYNC_DATA (1<<1) diff --git a/include/linux/ext2_fs.h b/include/linux/ext2_fs.h index 0d008cfb99a8..d701ba88c688 100644 --- a/include/linux/ext2_fs.h +++ b/include/linux/ext2_fs.h @@ -308,6 +308,7 @@ struct ext2_inode { #define EXT2_MOUNT_ERRORS_RO 0x0020 /* Remount fs ro on errors */ #define EXT2_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */ #define EXT2_MOUNT_MINIX_DF 0x0080 /* Mimics the Minix statfs */ +#define EXT2_MOUNT_NOBH 0x0100 /* No buffer_heads */ #define EXT2_MOUNT_NO_UID32 0x0200 /* Disable 32-bit UIDs */ #define EXT2_MOUNT_XATTR_USER 0x4000 /* Extended user 
attributes */ #define EXT2_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 44480d80952f..f02449871c07 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -70,6 +70,7 @@ #define PG_chainlock 15 /* lock bit for ->pte_chain */ #define PG_direct 16 /* ->pte_chain points directly at pte */ +#define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ /* * Global page accounting. One instance per CPU. Only unsigned longs are @@ -233,6 +234,10 @@ extern void get_full_page_state(struct page_state *ret); #define ClearPageDirect(page) clear_bit(PG_direct, &(page)->flags) #define TestClearPageDirect(page) test_and_clear_bit(PG_direct, &(page)->flags) +#define PageMappedToDisk(page) test_bit(PG_mappedtodisk, &(page)->flags) +#define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags) +#define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags) + /* * The PageSwapCache predicate doesn't use a PG_flag at this time, * but it may again do so one day. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d8921e02318a..a810142b49c0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -265,7 +265,7 @@ static void prep_new_page(struct page *page, int order) page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1 | - 1 << PG_checked); + 1 << PG_checked | 1 << PG_mappedtodisk); set_page_refs(page, order); } diff --git a/mm/truncate.c b/mm/truncate.c index 884b4e3930c2..25843367c31e 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -52,6 +52,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page) clear_page_dirty(page); ClearPageUptodate(page); + ClearPageMappedToDisk(page); remove_from_page_cache(page); page_cache_release(page); /* pagecache ref */ } -- 2.39.5