--- /dev/null
+
+Making Filesystems Exportable
+=============================
+
+Most filesystem operations require a dentry (or two) as a starting
+point. Local applications have a reference-counted hold on suitable
+dentries via open file descriptors or cwd/root. However, remote
+applications that access a filesystem via a remote filesystem protocol
+such as NFS may not be able to hold such a reference, and so need a
+different way to refer to a particular dentry. As the alternative
+form of reference needs to be stable across renames, truncates, and
+server-reboot (among other things, though these tend to be the most
+problematic), there is no simple answer like 'filename'.
+
+The mechanism discussed here allows each filesystem implementation to
+specify how to generate an opaque (outside of the filesystem) byte
+string for any dentry, and how to find an appropriate dentry for any
+given opaque byte string.
+This byte string will be called a "filehandle fragment" as it
+corresponds to part of an NFS filehandle.
+
+A filesystem which supports the mapping between filehandle fragments
+and dentries will be termed "exportable".
+
+
+
+Dcache Issues
+-------------
+
+The dcache normally contains a proper prefix of any given filesystem
+tree. This means that if any filesystem object is in the dcache, then
+all of the ancestors of that filesystem object are also in the dcache.
+As normal access is by filename, this prefix is created naturally and
+maintained easily (by each object maintaining a reference count on
+its parent).
+
+However when objects are included into the dcache by interpreting a
+filehandle fragment, there is no automatic creation of a path prefix
+for the object. This leads to two related but distinct features of
+the dcache that are not needed for normal filesystem access.
+
+1/ The dcache must sometimes contain objects that are not part of the
+ proper prefix, i.e. that are not connected to the root.
+2/ The dcache must be prepared for a newly found (via ->lookup) directory
+ to already have a (non-connected) dentry, and must be able to move
+ that dentry into place (based on the parent and name in the
+ ->lookup). This is particularly needed for directories as
+ it is a dcache invariant that directories only have one dentry.
+
+To implement these features, the dcache has:
+
+a/ A dentry flag DCACHE_DISCONNECTED which is set on
+ any dentry that might not be part of the proper prefix.
+ This is set when anonymous dentries are created, and cleared when a
+ dentry is noticed to be a child of a dentry which is in the proper
+ prefix.
+
+b/ A per-superblock list "s_anon" of dentries which are the roots of
+ subtrees that are not in the proper prefix. These dentries, as
+ well as the proper prefix, need to be released at unmount time. As
+ these dentries will not be hashed, they are linked together on the
+ d_hash list_head.
+
+c/ Helper routines to allocate anonymous dentries, and to help attach
+ loose directory dentries at lookup time. They are:
+ d_alloc_anon(inode) will return a dentry for the given inode.
+ If the inode already has a dentry, one of those is returned.
+ If it doesn't, a new anonymous (IS_ROOT and
+ DCACHE_DISCONNECTED) dentry is allocated and attached.
+ In the case of a directory, care is taken that only one dentry
+ can ever be attached.
+ d_splice_alias(inode, dentry) will make sure that there is a
+ dentry with the same name and parent as the given dentry, and
+ which refers to the given inode.
+ If the inode is a directory and already has a dentry, then that
+ dentry is d_moved over the given dentry.
+ If the passed dentry gets attached, care is taken that this is
+ mutually exclusive to a d_alloc_anon operation.
+ If the passed dentry is used, NULL is returned, else the used
+ dentry is returned. This corresponds to the calling pattern of
+ ->lookup.
+
+
+Filesystem Issues
+-----------------
+
+For a filesystem to be exportable it must:
+
+ 1/ provide the filehandle fragment routines described below
+ (later).
+ 2/ make sure that d_splice_alias is used rather than d_add
+ when ->lookup finds an inode for a given parent and name.
+ Typically the ->lookup routine will end:
+ if (inode)
+ return d_splice_alias(inode, dentry);
+ d_add(dentry, inode);
+ return NULL;
+ }
+
+
* return it. Otherwise return NULL. Notice that if inode is a directory
* there can be only one alias and it can be unhashed only if it has
* no children.
+ *
+ * If the inode has a DCACHE_DISCONNECTED alias, then prefer
+ * any other hashed alias over that one.
*/
struct dentry * d_find_alias(struct inode *inode)
{
struct list_head *head, *next, *tmp;
- struct dentry *alias;
+ struct dentry *alias, *discon_alias=NULL;
spin_lock(&dcache_lock);
head = &inode->i_dentry;
next = tmp->next;
alias = list_entry(tmp, struct dentry, d_alias);
if (!list_empty(&alias->d_hash)) {
- __dget_locked(alias);
- spin_unlock(&dcache_lock);
- return alias;
+ if (alias->d_flags & DCACHE_DISCONNECTED)
+ discon_alias = alias;
+ else {
+ __dget_locked(alias);
+ spin_unlock(&dcache_lock);
+ return alias;
+ }
}
}
+ if (discon_alias)
+ __dget_locked(discon_alias);
spin_unlock(&dcache_lock);
- return NULL;
+ return discon_alias;
}
/*
prune_dcache(found);
}
+/**
+ * shrink_dcache_anon - further prune the cache
+ * @head: head of d_hash list of dentries to prune
+ *
+ * Walk the list at @head (whose entries are linked through d_hash),
+ * move every entry whose d_count is zero to the tail of dentry_unused,
+ * and ask prune_dcache() to release as many dentries as were moved.
+ * Repeat until a pass over the list finds no unused entry.
+ *
+ * prune_dcache() is deliberately never called with a count of zero:
+ * shrink_dcache() is defined as prune_dcache(0), which suggests that a
+ * zero count requests pruning of the whole unused list rather than
+ * nothing at all.
+ */
+void shrink_dcache_anon(struct list_head *head)
+{
+	struct list_head *lp;
+	int found;
+
+	for (;;) {
+		found = 0;
+		spin_lock(&dcache_lock);
+		list_for_each(lp, head) {
+			struct dentry *this = list_entry(lp, struct dentry, d_hash);
+
+			if (!atomic_read(&this->d_count)) {
+				/* requeue at the tail of the unused list */
+				list_del(&this->d_lru);
+				list_add_tail(&this->d_lru, &dentry_unused);
+				found++;
+			}
+		}
+		spin_unlock(&dcache_lock);
+
+		/* nothing left to release: stop before asking for "0" */
+		if (!found)
+			break;
+		prune_dcache(found);
+	}
+}
+
/*
* This is called from kswapd when we think we need some
* more memory, but aren't really sure how much. So we
return dentry_hashtable + (hash & D_HASHMASK);
}
+/**
+ * d_alloc_anon - allocate an anonymous dentry
+ * @inode: inode to allocate the dentry for
+ *
+ * This is similar to d_alloc_root.  It is used by filesystems when
+ * creating a dentry for a given inode, often in the process of
+ * mapping a filehandle to a dentry.  The returned dentry may be
+ * anonymous, or may have a full name (if the inode was already
+ * in the cache).  The file system may need to make further
+ * efforts to connect this dentry into the dcache properly.
+ *
+ * When called on a directory inode, we must ensure that
+ * the inode only ever has one dentry.  If a dentry is
+ * found, that is returned instead of allocating a new one.
+ *
+ * On successful return, the reference to the inode has either been
+ * transferred to the newly attached dentry or, when an existing
+ * dentry is returned, dropped with iput().  If %NULL is returned
+ * (the dentry allocation failed), the reference on the inode has
+ * not been released and still belongs to the caller.
+ */
+
+struct dentry * d_alloc_anon(struct inode *inode)
+{
+	struct dentry *tmp;
+	struct dentry *res;
+
+	/* Fast path: the inode already has a usable alias. */
+	res = d_find_alias(inode);
+	if (res) {
+		iput(inode);
+		return res;
+	}
+
+	tmp = d_alloc(NULL, &(const struct qstr) {"",0,0});
+	if (!tmp)
+		return NULL;	/* inode reference stays with the caller */
+	tmp->d_parent = tmp; /* make sure dput doesn't croak */
+
+	spin_lock(&dcache_lock);
+	if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) {
+		/* A directory can only have one dentry.
+		 * This (now) has one, so use it.
+		 */
+		res = list_entry(inode->i_dentry.next, struct dentry, d_alias);
+		__dget_locked(res);
+	} else {
+		/* attach a disconnected dentry */
+		res = tmp;
+		tmp = NULL;
+		res->d_sb = inode->i_sb;
+		res->d_parent = res;
+		res->d_inode = inode;
+		res->d_flags |= DCACHE_DISCONNECTED;
+		list_add(&res->d_alias, &inode->i_dentry);
+		/* anonymous dentries are chained on s_anon through d_hash */
+		list_add(&res->d_hash, &inode->i_sb->s_anon);
+		inode = NULL; /* don't drop reference */
+	}
+	spin_unlock(&dcache_lock);
+
+	/* An existing alias was used: release what we no longer need. */
+	if (inode)
+		iput(inode);
+	if (tmp)
+		dput(tmp);
+	return res;
+}
+
+
+/**
+ * d_splice_alias - splice an existing directory dentry into the tree
+ * @inode: the inode which may already have a dentry
+ * @dentry: a negative dentry which we want to point to the inode.
+ *
+ * If @inode is a directory that already has a dentry on its i_dentry
+ * list, @dentry is rehashed, the first dentry on that list is
+ * d_move()d into its place and returned with a reference held, and
+ * the caller's reference on @inode is dropped with iput().
+ *
+ * In every other case (no inode, a non-directory, or a directory
+ * without any dentry yet) @inode is attached to @dentry, @dentry is
+ * hashed, and %NULL is returned.
+ *
+ * The return value matches what ->lookup is expected to return, so an
+ * exportable filesystem can finish its lookup routine with a call to
+ * this helper.
+ */
+struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
+{
+	struct dentry *alias;
+
+	/* Only directories need the single-dentry treatment. */
+	if (!inode || !S_ISDIR(inode->i_mode)) {
+		d_add(dentry, inode);
+		return NULL;
+	}
+
+	spin_lock(&dcache_lock);
+	if (list_empty(&inode->i_dentry)) {
+		/* d_instantiate takes dcache_lock, so we do it by hand */
+		list_add(&dentry->d_alias, &inode->i_dentry);
+		dentry->d_inode = inode;
+		spin_unlock(&dcache_lock);
+		d_rehash(dentry);
+		return NULL;
+	}
+
+	/* The directory already has a dentry: move it over @dentry. */
+	alias = list_entry(inode->i_dentry.next, struct dentry, d_alias);
+	__dget_locked(alias);
+	spin_unlock(&dcache_lock);
+	d_rehash(dentry);
+	d_move(alias, dentry);
+	iput(inode);
+	return alias;
+}
+
+
/**
* d_lookup - search for a dentry
* @parent: parent dentry
list_del(&dentry->d_child);
list_del(&target->d_child);
- /* Switch the parents and the names.. */
+ /* Switch the names.. */
switch_names(dentry, target);
- write_lock(&dparent_lock);
- do_switch(dentry->d_parent, target->d_parent);
- write_unlock(&dparent_lock);
do_switch(dentry->d_name.len, target->d_name.len);
do_switch(dentry->d_name.hash, target->d_name.hash);
+ /* ... and switch the parents */
+ write_lock(&dparent_lock);
+ if (IS_ROOT(dentry)) {
+ dentry->d_parent = target->d_parent;
+ target->d_parent = target;
+ INIT_LIST_HEAD(&target->d_child);
+ } else {
+ do_switch(dentry->d_parent, target->d_parent);
+
+ /* And add them back to the (new) parent lists */
+ list_add(&target->d_child, &target->d_parent->d_subdirs);
+ }
+ write_unlock(&dparent_lock);
- /* And add them back to the (new) parent lists */
- list_add(&target->d_child, &target->d_parent->d_subdirs);
list_add(&dentry->d_child, &dentry->d_parent->d_subdirs);
spin_unlock(&dcache_lock);
}
int len, int fhtype, int parent)
{
struct inode *inode = NULL;
- struct list_head *lp;
struct dentry *result;
if (fhtype != 3)
/* now to find a dentry.
* If possible, get a well-connected one
- *
- * Given the way that we found the inode, it *MUST* be
- * well-connected, but it is easiest to just copy the
- * code.
*/
- spin_lock(&dcache_lock);
- list_for_each(lp, &inode->i_dentry) {
- result = list_entry(lp,struct dentry, d_alias);
- if (! (result->d_flags & DCACHE_NFSD_DISCONNECTED)) {
- dget_locked(result);
- result->d_vfs_flags |= DCACHE_REFERENCED;
- spin_unlock(&dcache_lock);
- iput(inode);
- return result;
- }
- }
- spin_unlock(&dcache_lock);
- result = d_alloc_root(inode);
+ result = d_alloc_anon(inode);
if (result == NULL) {
iput(inode);
return ERR_PTR(-ENOMEM);
}
- result->d_flags |= DCACHE_NFSD_DISCONNECTED;
+ result->d_vfs_flags |= DCACHE_REFERENCED;
return result;
-
}
}
d_instantiate(dentry, inode);
- dentry->d_flags |= DCACHE_NFSD_DISCONNECTED; /* NFS hack */
+ dentry->d_flags |= DCACHE_DISCONNECTED; /* NFS hack */
EXIT;
return NULL;
* of 0 means "accept any"
*/
struct inode *inode;
- struct list_head *lp;
struct dentry *result;
if (ino == 0)
return ERR_PTR(-ESTALE);
/* now to find a dentry.
* If possible, get a well-connected one
*/
- spin_lock(&dcache_lock);
- list_for_each(lp, &inode->i_dentry) {
- result = list_entry(lp,struct dentry, d_alias);
- if (! (result->d_flags & DCACHE_NFSD_DISCONNECTED)) {
- dget_locked(result);
- result->d_vfs_flags |= DCACHE_REFERENCED;
- spin_unlock(&dcache_lock);
- iput(inode);
- return result;
- }
- }
- spin_unlock(&dcache_lock);
- result = d_alloc_root(inode);
- if (result == NULL) {
+ result = d_alloc_anon(inode);
+ if (!result) {
iput(inode);
return ERR_PTR(-ENOMEM);
}
- result->d_flags |= DCACHE_NFSD_DISCONNECTED;
+ result->d_vfs_flags |= DCACHE_REFERENCED;
return result;
}
#ifdef NFSD_PARANOIA
if (!IS_ROOT(target))
printk("nfsd: d_splice with no-root target: %s/%s\n", parent->d_name.name, name->name);
- if (!(target->d_flags & DCACHE_NFSD_DISCONNECTED))
+ if (!(target->d_flags & DCACHE_DISCONNECTED))
printk("nfsd: d_splice with non-DISCONNECTED target: %s/%s\n", parent->d_name.name, name->name);
#endif
tdentry = d_alloc(parent, name);
return -ENOMEM;
d_move(target, tdentry);
- /* tdentry will have been made a "child" of target (the parent of target)
- * make it an IS_ROOT instead
- */
- spin_lock(&dcache_lock);
- list_del_init(&tdentry->d_child);
- tdentry->d_parent = tdentry;
- spin_unlock(&dcache_lock);
d_rehash(target);
dput(tdentry);
* the children are connected, but it must be a singluar (non-forking)
* branch
*/
- if (!(parent->d_flags & DCACHE_NFSD_DISCONNECTED)) {
+ if (!(parent->d_flags & DCACHE_DISCONNECTED)) {
while (target) {
- target->d_flags &= ~DCACHE_NFSD_DISCONNECTED;
+ target->d_flags &= ~DCACHE_DISCONNECTED;
parent = target;
spin_lock(&dcache_lock);
if (list_empty(&parent->d_subdirs))
#ifdef NFSD_PARANOIA
/* must be only child */
if (target->d_child.next != &parent->d_subdirs
- || target->d_child.prev != &parent->d_subdirs)
+ || target->d_child.prev != &parent->d_subdirs) {
printk("nfsd: d_splice found non-singular disconnected branch: %s/%s\n",
parent->d_name.name, target->d_name.name);
+ spin_unlock(&dcache_lock);
+ return 0;
+ }
#endif
}
spin_unlock(&dcache_lock);
pdentry = d_alloc_root(tdentry->d_inode);
if (pdentry) {
igrab(tdentry->d_inode);
- pdentry->d_flags |= DCACHE_NFSD_DISCONNECTED;
+ pdentry->d_flags |= DCACHE_DISCONNECTED;
}
}
if (pdentry == NULL)
down(&sb->s_nfsd_free_path_sem);
result = nfsd_get_dentry(sb, datap, len, fhtype, 0);
if (IS_ERR(result)
- || !(result->d_flags & DCACHE_NFSD_DISCONNECTED)
+ || !(result->d_flags & DCACHE_DISCONNECTED)
|| (!S_ISDIR(result->d_inode->i_mode) && ! needpath)) {
up(&sb->s_nfsd_free_path_sem);
err = PTR_ERR(result);
if (IS_ERR(result))
goto err_out;
- if ((result->d_flags & DCACHE_NFSD_DISCONNECTED))
+ if ((result->d_flags & DCACHE_DISCONNECTED))
nfsdstats.fh_anon++;
return result;
}
dentry = dget(result);
}
- while(dentry->d_flags & DCACHE_NFSD_DISCONNECTED) {
+ while(dentry->d_flags & DCACHE_DISCONNECTED) {
/* LOOP INVARIANT */
/* haven't found a place in the tree yet, but we do have a free path
* from dentry down to result, and dentry is a directory.
}
#ifdef NFSD_PARANOIA
if (S_ISDIR(dentry->d_inode->i_mode) &&
- (dentry->d_flags & DCACHE_NFSD_DISCONNECTED)) {
+ (dentry->d_flags & DCACHE_DISCONNECTED)) {
printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %s/%s\n",
dentry->d_parent->d_name.name, dentry->d_name.name);
}
int len, int fhtype, int parent) {
struct cpu_key key ;
struct inode *inode = NULL ;
- struct list_head *lp;
struct dentry *result;
/* fhtype happens to reflect the number of u32s encoded.
/* now to find a dentry.
* If possible, get a well-connected one
*/
- spin_lock(&dcache_lock);
- list_for_each(lp, &inode->i_dentry) {
- result = list_entry(lp,struct dentry, d_alias);
- if (! (result->d_flags & DCACHE_NFSD_DISCONNECTED)) {
- dget_locked(result);
- result->d_vfs_flags |= DCACHE_REFERENCED;
- spin_unlock(&dcache_lock);
- iput(inode);
- return result;
- }
- }
- spin_unlock(&dcache_lock);
- result = d_alloc_root(inode);
+ result = d_alloc_anon(inode);
if (result == NULL) {
iput(inode);
return ERR_PTR(-ENOMEM);
}
- result->d_flags |= DCACHE_NFSD_DISCONNECTED;
+ result->d_vfs_flags |= DCACHE_REFERENCED;
return result;
}
INIT_LIST_HEAD(&s->s_locked_inodes);
INIT_LIST_HEAD(&s->s_files);
INIT_LIST_HEAD(&s->s_instances);
+ INIT_LIST_HEAD(&s->s_anon);
init_rwsem(&s->s_umount);
sema_init(&s->s_lock, 1);
down_write(&s->s_umount);
if (root) {
sb->s_root = NULL;
shrink_dcache_parent(root);
+ shrink_dcache_anon(&sb->s_anon);
dput(root);
fsync_super(sb);
lock_super(sb);
* renamed" and has to be
* deleted on the last dput()
*/
-#define DCACHE_NFSD_DISCONNECTED 0x0004 /* This dentry is not currently connected to the
+#define DCACHE_DISCONNECTED 0x0004 /* This dentry is not currently connected to the
* dcache tree. Its parent will either be itself,
* or will have this flag as well.
* If this dentry points to a directory, then
/* allocate/de-allocate */
extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
+extern struct dentry * d_alloc_anon(struct inode *);
+extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
extern void shrink_dcache_sb(struct super_block *);
extern void shrink_dcache_parent(struct dentry *);
+extern void shrink_dcache_anon(struct list_head *);
extern int d_invalidate(struct dentry *);
#define shrink_dcache() prune_dcache(0)
struct list_head s_dirty; /* dirty inodes */
struct list_head s_locked_inodes;/* inodes being synced */
+ struct list_head s_anon; /* anonymous dentries for (nfs) exporting */
struct list_head s_files;
struct block_device *s_bdev;
* and must return a dentry for the referenced object or, if "parent" is
* set, a dentry for the parent of the object.
* If a dentry cannot be found, a "root" dentry should be created and
- * flaged as DCACHE_NFSD_DISCONNECTED. nfsd_iget is an example implementation.
+ * flagged as DCACHE_DISCONNECTED. nfsd_iget is an example implementation.
*
* dentry_to_fh is given a dentry and must generate the filesys specific
* part of the file handle. Available length is passed in *lenp and used
EXPORT_SYMBOL(d_move);
EXPORT_SYMBOL(d_instantiate);
EXPORT_SYMBOL(d_alloc);
+EXPORT_SYMBOL(d_alloc_anon);
+EXPORT_SYMBOL(d_splice_alias);
EXPORT_SYMBOL(d_lookup);
EXPORT_SYMBOL(__d_path);
EXPORT_SYMBOL(mark_buffer_dirty);
EXPORT_SYMBOL(prune_dcache);
EXPORT_SYMBOL(shrink_dcache_sb);
EXPORT_SYMBOL(shrink_dcache_parent);
+EXPORT_SYMBOL(shrink_dcache_anon);
EXPORT_SYMBOL(find_inode_number);
EXPORT_SYMBOL(is_subdir);
EXPORT_SYMBOL(get_unused_fd);