From: NeilBrown <neilb@suse.de>
Date: Fri, 4 Mar 2011 23:44:02 +0000 (+1100)
Subject: Start implementation of separate access-time file.
X-Git-Url: http://git.neil.brown.name/?a=commitdiff_plain;h=99d9894fa6ba269748eb6ea94f33067bad9915ce;p=LaFS.git

Start implementation of separate access-time file.

Add README documentation,
load file at mount time, release at unmount.

Signed-off-by: NeilBrown <neilb@suse.de>
---

diff --git a/README b/README
index 6c3d454..21e7336 100644
--- a/README
+++ b/README
@@ -7041,3 +7041,113 @@ WritePhase - what is that all about?
    We should allow one 'page' for each metadatum, which probably meanss
    32K.
    So we should allow all state blocks to be near the start.
+
+01mar2011 - Autumn arrives.
+
+  Time to add handling of 'atime' and non-logged files.
+
+  The idea is to have a separate file for storing only 'atime'
+  This is separate from the inode file because the volatility of the data
+  is very different and one of the principles of log-structured-fs is that
+  differently volatile data should be kept separate.
+
+  This does mean that an inode lookup requires getting data from two files,
+  but it is hoped that the 'atime' file will mostly be in cache as each
+  block contains the atime for lots of different inodes.
+
+  The atime file contains 2 bytes for each inode, so with a block size of 4K,
+  each block would hold info for 2048 inodes.  1 million inodes would require
+  2 megabytes.
+
+  The 16bits are treated as a positive floating point number which
+  gets added to the atime stored in the inode.  The lower 5 bits are
+  the exponent, the remaining 11 bits are mantissa.  Though there is a
+  little complexity in interpreting the exponent.
+     If the exponent is 0, the mantissa is used directly as milliseconds -
+       so shift left 5 and multiply by 1000000 for nanoseconds.
+       The smallest change that can be recorded is 1 millisecond,
+       and values up to (2^11-1) milliseconds - or 2 seconds - can be stored.
+     If the exponent is 1 to 10, the mantissa has a '1' appended as a
+       new msb, and is shifted by the exponent-1 and then treated as milliseconds.
+       This ranges up to 2^(12+9) milliseconds or 30 minutes, where
+       the granularity will be 2^9 millisecs or 0.5 seconds
+
+  
+     For exponents from 11 up to 31 we add the 1 msb and treat
+       the number as seconds after shifting (e-11).  So at e==31,
+       we shift a number that is
+       up to 4095 by 20 to get nearly 2^32 seconds or 136 years.
+       At this point the granularity is 2^20 seconds or 12 days.
+
+
+   So overall we can update the atime for 136 years without needing to
+   update the inode, and can record differences of 1msec for the first
+   couple of seconds, then gradually less granularity until we are
+   down to one second an hour after the last change, and 4 hours a
+   year later.
+
+   To convert a number of seconds to this format:
+
+   If >= 2048 seconds, we shift down until less than 4096 seconds
+   counting the shift.  We add 11 to that number to form exponent,
+   and shift the resulting mantissa up 5, or with exponent, and mask
+   out bit 16.
+
+   Otherwise we convert to milliseconds (divide nanoseconds by 1000000 and
+   multiply seconds by 1000, and add). Then if < 2048, we shift up by
+   5 leaving a zero exponent and use that.
+
+   Otherwise we shift down until < 4096 counting shifts, add 1 to the
+   shift to form an exponent, and combine with mantissa as above.
+   
+
+   So that is the format - how do we implement it?
+
+   We don't want to expose to user-space numbers that we cannot store.
+   So any 'utimes' call updates the inode directly and clears the value
+   in the atime file.  Only updates due to accesses go to the atimes
+   file.
+   We define a 'getattr' function which looks at the atime stored in
+   the vfs inode and if it has changed we need to deal with it.
+    - if the inode is still dirty we simply update the lafs inode
+      and use the number as-is, clearing the atimes entry
+    - else we subtract the stored atime from the new atime.  If this
+      is negative or exceeds 136 years we mark the inode dirty and
+      store it there.  If we cannot mark the inode dirty for some
+      reason we just store all 1s in the atime file.
+
+    The same operation is needed when dirty_inode is called to make
+    sure atime updates get saved even when no getattr is called.
+
+    As we always need to be able to update the atime file, it needs to
+    be permanently pinned whenever an inode is read in.  For
+    non-logged files this should be cheap but we must do it anyway as
+    the file might not be non-logged.
+    So we need to keep a permanent reference to each block while the
+    inode is loaded.  That can keep it pinned.
+
+
+    We don't want updates to the atime file to be flushed in any great
+    hurry, especially if it is a logged file.  We would be quite happy
+    to only write at 'unmount' and probably 'sync'.
+    So we want to stop the pages from appearing dirty in the page
+    cache (PAGECACHE_TAG_DIRTY), and the inode from appearing dirty
+    (I_DIRTY).
+    We can still keep them dirty in lafs metadata so if release_page
+    is called we can schedule a write out then.
+    
+
+   So some steps:
+
+    1/ load atime file at mount time - there is one for each
+      filesystem.  It has inum of 3 and type of TypeAccessTime (6).
+      Also release it on unmount.
+
+    2/ loading an inode must take a ref to the block in the atime file
+      if it exists.  A new inode flag records if this has happened.
+      Unless mounted noatime, we pin the block and reserve space.
+
+    3/ getattr and dirty_inode must resolve any issues with the
+       atime.  So lafs_inode probably needs an extra field to be able
+       to check for changes
+
diff --git a/inode.c b/inode.c
index 52659c3..fd8ab77 100644
--- a/inode.c
+++ b/inode.c
@@ -304,6 +304,7 @@ lafs_import_inode(struct inode *ino, struct datablock *b)
 			= i->quota_inodes[2] = NULL;
 		nlen = li->metadata_size - offsetof(struct la_inode,
 						    metadata[0].fs.name);
+		i->accesstime = NULL;
 		if (i->name)
 			kfree(i->name);
 		if (nlen == 0)
diff --git a/roll.c b/roll.c
index 39cee18..3e3a6e3 100644
--- a/roll.c
+++ b/roll.c
@@ -871,6 +871,17 @@ lafs_mount(struct fs *fs)
 		fs->cleaner.seg[d].chead = p;
 		INIT_LIST_HEAD(&fs->cleaner.seg[d].cleaning);
 	}
+
+	ino = lafs_iget(fs->prime_sb, 3, SYNC);
+	if (!IS_ERR(ino)) {
+		if (LAFSI(ino)->type != TypeAccessTime) {
+			iput(ino);
+			err = -EINVAL;
+		} else
+			LAFSI(fs->ss[0].root)->md.fs.accesstime = ino;
+	} else if (PTR_ERR(ino) != -ENOENT)
+		err = PTR_ERR(ino);
+
 err:
 	putdref(b, MKREF(mount));
 	return err;
diff --git a/state.h b/state.h
index f65fb08..ecbc996 100644
--- a/state.h
+++ b/state.h
@@ -604,6 +604,7 @@ struct lafs_inode {
 			u32	inodes_used;
 			u32	quota_inums[3];
 			struct inode *quota_inodes[3];
+			struct inode *accesstime;
 			char	*name;
 		} fs;
 		struct inodemap_md {
diff --git a/super.c b/super.c
index f96c346..7429bc9 100644
--- a/super.c
+++ b/super.c
@@ -755,6 +755,12 @@ static void lafs_kill_sb(struct super_block *sb)
 		   fs->scan.done == 1 &&
 		   fs->cleaner.active == 0);
 
+	if (LAFSI(fs->ss[0].root)->md.fs.accesstime) {
+		struct inode *i = LAFSI(fs->ss[0].root)->md.fs.accesstime;
+		LAFSI(fs->ss[0].root)->md.fs.accesstime = NULL;
+		iput(i);
+	}
+
 	kill_anon_super(fs->prime_sb);
 
 	bdi_destroy(&fs->bdi);
@@ -1041,6 +1047,18 @@ struct super_block *lafs_get_subset_sb(struct inode *ino)
 				iput(imapfile);
 		}
 
+		if (!err) {
+			struct inode *atime = lafs_iget(sb, 3, SYNC);
+			if (!IS_ERR(atime)) {
+				if (LAFSI(atime)->type != TypeAccessTime) {
+					iput(atime);
+					err = -EINVAL;
+				} else
+					LAFSI(ino)->md.fs.accesstime = atime;
+			} else if (PTR_ERR(atime) != -ENOENT)
+				err = PTR_ERR(atime);
+		}
+
 		if (!err) {
 			sb->s_op = fs->prime_sb->s_op;
 			sb->s_flags |= MS_ACTIVE;
@@ -1136,6 +1154,7 @@ lafs_get_subset(struct file_system_type *fs_type,
 			md->quota_inodes[0] = NULL;
 			md->quota_inodes[1] = NULL;
 			md->quota_inodes[2] = NULL;
+			md->accesstime = NULL;
 			md->name = NULL;
 			lafs_dirty_dblock(inodb);
 			lafs_dirty_inode(ino);
@@ -1171,6 +1190,10 @@ out_noput:
 static void lafs_kill_subset(struct super_block *sb)
 {
 	struct sb_key *k = sb->s_fs_info;
+	if (LAFSI(k->root)->md.fs.accesstime) {
+		iput(LAFSI(k->root)->md.fs.accesstime);
+		LAFSI(k->root)->md.fs.accesstime = NULL;
+	}
 	kill_anon_super(sb);
 	iput(k->root);
 	deactivate_super(k->fs->prime_sb);