patch-2.4.22 linux-2.4.22/mm/shmem.c

diff -urN linux-2.4.21/mm/shmem.c linux-2.4.22/mm/shmem.c
--- linux-2.4.21/mm/shmem.c
+++ linux-2.4.22/mm/shmem.c
@@ -5,7 +5,10 @@
  *		 2000 Transmeta Corp.
  *		 2000-2001 Christoph Rohland
  *		 2000-2001 SAP AG
- * 
+ *		 2002 Red Hat Inc.
+ * Copyright (C) 2002-2003 Hugh Dickins.
+ * Copyright (C) 2002-2003 VERITAS Software Corporation.
+ *
  * This file is released under the GPL.
  */
 
@@ -29,22 +32,35 @@
 #include <linux/smp_lock.h>
 
 #include <asm/uaccess.h>
+#include <asm/div64.h>
 
 /* This magic number is used in glibc for posix shared memory */
 #define TMPFS_MAGIC	0x01021994
 
 #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
+#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
 #define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)
 
-#define SHMEM_MAX_INDEX  (SHMEM_NR_DIRECT + ENTRIES_PER_PAGE * (ENTRIES_PER_PAGE/2) * (ENTRIES_PER_PAGE+1))
+#define SHMEM_MAX_INDEX  (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
 #define SHMEM_MAX_BYTES  ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT)
-#define VM_ACCT(size)    (((size) + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT)
+
+#define VM_ACCT(size)    (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
 
 /* Pretend that each entry is of this size in directory's i_size */
 #define BOGO_DIRENT_SIZE 20
 
 #define SHMEM_SB(sb) (&sb->u.shmem_sb)
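
To get a feel for the new limit macros, here is the arithmetic as a
minimal userspace program, assuming PAGE_CACHE_SIZE 4096, a 4-byte
unsigned long, and SHMEM_NR_DIRECT 16 (values taken from the 2.4 i386
headers, not from this patch):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long epp = 4096 / 4;	/* ENTRIES_PER_PAGE */
		unsigned long long eppp = epp * epp;	/* ENTRIES_PER_PAGEPAGE */
		unsigned long long max_index = 16 + (eppp / 2) * (epp + 1);

		printf("SHMEM_MAX_INDEX = %llu pages\n", max_index);
		printf("SHMEM_MAX_BYTES = %llu (~%llu GiB)\n",
		       max_index << 12, (max_index << 12) >> 30);
		return 0;
	}

This prints roughly 2050 GiB, i.e. about 2 TiB per file. Note that the
new SHMEM_MAX_INDEX formula is algebraically identical to the old one,
since ENTRIES_PER_PAGE * (ENTRIES_PER_PAGE/2) == ENTRIES_PER_PAGEPAGE/2;
only the expression changed.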
 
+/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
+enum sgp_type {
+	SGP_READ,	/* don't exceed i_size, don't allocate page */
+	SGP_CACHE,	/* don't exceed i_size, may allocate page */
+	SGP_WRITE,	/* may exceed i_size, may allocate page */
+};
+
+static int shmem_getpage(struct inode *inode, unsigned long idx,
+			 struct page **pagep, enum sgp_type sgp);
+
 static struct super_operations shmem_ops;
 static struct address_space_operations shmem_aops;
 static struct file_operations shmem_file_operations;
@@ -52,45 +68,22 @@
 static struct inode_operations shmem_dir_inode_operations;
 static struct vm_operations_struct shmem_vm_ops;
 
-LIST_HEAD (shmem_inodes);
+LIST_HEAD(shmem_inodes);
 static spinlock_t shmem_ilock = SPIN_LOCK_UNLOCKED;
-atomic_t shmem_nrpages = ATOMIC_INIT(0); /* Not used right now */
 
-static struct page *shmem_getpage_locked(struct shmem_inode_info *, struct inode *, unsigned long);
-
-/*
- * shmem_recalc_inode - recalculate the size of an inode
- *
- * @inode: inode to recalc
- * @swap:  additional swap pages freed externally
- *
- * We have to calculate the free blocks since the mm can drop pages
- * behind our back
- *
- * But we know that normally
- * inodes->i_blocks/BLOCKS_PER_PAGE == 
- * 			inode->i_mapping->nrpages + info->swapped
- *
- * So the mm freed 
- * inodes->i_blocks/BLOCKS_PER_PAGE - 
- * 			(inode->i_mapping->nrpages + info->swapped)
- *
- * It has to be called with the spinlock held.
- */
-
-static void shmem_recalc_inode(struct inode * inode)
+static void shmem_free_block(struct inode *inode)
 {
-	unsigned long freed;
+	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+	spin_lock(&sbinfo->stat_lock);
+	sbinfo->free_blocks++;
+	inode->i_blocks -= BLOCKS_PER_PAGE;
+	spin_unlock(&sbinfo->stat_lock);
+}
 
-	freed = (inode->i_blocks/BLOCKS_PER_PAGE) -
-		(inode->i_mapping->nrpages + SHMEM_I(inode)->swapped);
-	if (freed){
-		struct shmem_sb_info * sbinfo = SHMEM_SB(inode->i_sb);
-		inode->i_blocks -= freed*BLOCKS_PER_PAGE;
-		spin_lock (&sbinfo->stat_lock);
-		sbinfo->free_blocks += freed;
-		spin_unlock (&sbinfo->stat_lock);
-	}
+static void shmem_removepage(struct page *page)
+{
+	if (!PageLaunder(page))
+		shmem_free_block(page->mapping->host);
 }
 
 /*
@@ -101,11 +94,9 @@
  * @page:  optional page to add to the structure. Has to be preset to
  *         all zeros
  *
- * If there is no space allocated yet it will return -ENOMEM when
- * page == 0 else it will use the page for the needed block.
- *
- * returns -EFBIG if the index is too big.
- *
+ * If there is no space allocated yet it will return NULL when
+ * page is 0, else it will use the page for the needed block,
+ * setting it to 0 on return to indicate that it has been used.
  *
  * The swap vector is organized the following way:
  *
@@ -133,90 +124,130 @@
  * 	      	       +-> 48-51
  * 	      	       +-> 52-55
  */
-static swp_entry_t * shmem_swp_entry (struct shmem_inode_info *info, unsigned long index, unsigned long page) 
+static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, unsigned long *page)
 {
 	unsigned long offset;
 	void **dir;
 
 	if (index < SHMEM_NR_DIRECT)
 		return info->i_direct+index;
+	if (!info->i_indirect) {
+		if (page) {
+			info->i_indirect = (void **) *page;
+			*page = 0;
+		}
+		return NULL;			/* need another page */
+	}
 
 	index -= SHMEM_NR_DIRECT;
 	offset = index % ENTRIES_PER_PAGE;
 	index /= ENTRIES_PER_PAGE;
+	dir = info->i_indirect;
 
-	if (!info->i_indirect) {
-		info->i_indirect = (void *) page;
-		return ERR_PTR(-ENOMEM);
-	}
-
-	dir = info->i_indirect + index;
 	if (index >= ENTRIES_PER_PAGE/2) {
 		index -= ENTRIES_PER_PAGE/2;
-		dir = info->i_indirect + ENTRIES_PER_PAGE/2 
-			+ index/ENTRIES_PER_PAGE;
+		dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
 		index %= ENTRIES_PER_PAGE;
-
-		if(!*dir) {
-			*dir = (void *) page;
-			/* We return since we will need another page
-                           in the next step */
-			return ERR_PTR(-ENOMEM);
+		if (!*dir) {
+			if (page) {
+				*dir = (void *) *page;
+				*page = 0;
+			}
+			return NULL;		/* need another page */
 		}
-		dir = ((void **)*dir) + index;
+		dir = (void **) *dir;
 	}
+
+	dir += index;
 	if (!*dir) {
-		if (!page)
-			return ERR_PTR(-ENOMEM);
-		*dir = (void *)page;
+		if (!page || !*page)
+			return NULL;		/* need a page */
+		*dir = (void *) *page;
+		*page = 0;
 	}
-	return ((swp_entry_t *)*dir) + offset;
+	return (swp_entry_t *) *dir + offset;
 }
 
 /*
- * shmem_alloc_entry - get the position of the swap entry for the
- *                     page. If it does not exist allocate the entry
+ * shmem_swp_alloc - get the position of the swap entry for the page.
+ *                   If it does not exist allocate the entry.
  *
  * @info:	info structure for the inode
  * @index:	index of the page to find
+ * @sgp:	check and recheck i_size? skip allocation?
  */
-static inline swp_entry_t * shmem_alloc_entry (struct shmem_inode_info *info, unsigned long index)
+static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp)
 {
+	struct inode *inode = info->inode;
+	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 	unsigned long page = 0;
-	swp_entry_t * res;
+	swp_entry_t *entry;
+	static const swp_entry_t unswapped = {0};
 
-	if (index >= SHMEM_MAX_INDEX)
-		return ERR_PTR(-EFBIG);
+	if (sgp != SGP_WRITE &&
+	    ((loff_t) index << PAGE_CACHE_SHIFT) >= inode->i_size)
+		return ERR_PTR(-EINVAL);
 
-	if (info->next_index <= index)
-		info->next_index = index + 1;
+	while (!(entry = shmem_swp_entry(info, index, &page))) {
+		if (sgp == SGP_READ)
+			return (swp_entry_t *) &unswapped;
+		/*
+		 * Test free_blocks against 1 not 0, since we have 1 data
+		 * page (and perhaps indirect index pages) yet to allocate:
+		 * a waste to allocate index if we cannot allocate data.
+		 */
+		spin_lock(&sbinfo->stat_lock);
+		if (sbinfo->free_blocks <= 1) {
+			spin_unlock(&sbinfo->stat_lock);
+			return ERR_PTR(-ENOSPC);
+		}
+		sbinfo->free_blocks--;
+		inode->i_blocks += BLOCKS_PER_PAGE;
+		spin_unlock(&sbinfo->stat_lock);
 
-	while ((res = shmem_swp_entry(info,index,page)) == ERR_PTR(-ENOMEM)) {
+		spin_unlock(&info->lock);
 		page = get_zeroed_page(GFP_USER);
-		if (!page)
+		spin_lock(&info->lock);
+
+		if (!page) {
+			shmem_free_block(inode);
+			return ERR_PTR(-ENOMEM);
+		}
+		if (sgp != SGP_WRITE &&
+		    ((loff_t) index << PAGE_CACHE_SHIFT) >= inode->i_size) {
+			entry = ERR_PTR(-EINVAL);
 			break;
+		}
+		if (info->next_index <= index)
+			info->next_index = index + 1;
 	}
-	return res;
+	if (page) {
+		/* another task gave its page, or truncated the file */
+		shmem_free_block(inode);
+		free_page(page);
+	}
+	if (info->next_index <= index && !IS_ERR(entry))
+		info->next_index = index + 1;
+	return entry;
 }
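
The rewritten shmem_swp_entry() keeps the old three-level layout, but
replaces the ERR_PTR(-ENOMEM) protocol with returning NULL and consuming
*page whenever a fresh index page is offered; that is what lets
shmem_swp_alloc() above loop, dropping info->lock around
get_zeroed_page(). The layout itself: the first SHMEM_NR_DIRECT entries
live in i_direct, the first half of the i_indirect page points straight
at pages of swap entries, and the second half points at pages of further
pointers. A standalone sketch of that classification (illustrative only,
not a kernel interface; 16 and 1024 are the assumed SHMEM_NR_DIRECT and
ENTRIES_PER_PAGE):

	enum swp_level { LVL_DIRECT, LVL_INDIRECT, LVL_DOUBLE };

	static enum swp_level classify(unsigned long index)
	{
		const unsigned long nr_direct = 16;	/* SHMEM_NR_DIRECT */
		const unsigned long epp = 1024;		/* ENTRIES_PER_PAGE */

		if (index < nr_direct)
			return LVL_DIRECT;	/* info->i_direct[index] */
		index = (index - nr_direct) / epp;	/* which entry page */
		if (index < epp / 2)
			return LVL_INDIRECT;	/* i_indirect[index] -> entry page */
		return LVL_DOUBLE;	/* second half -> page of pointers */
	}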
 
 /*
  * shmem_free_swp - free some swap entries in a directory
  *
  * @dir:   pointer to the directory
- * @count: number of entries to scan
+ * @edir:  pointer after last entry of the directory
  */
-static int shmem_free_swp(swp_entry_t *dir, unsigned int count)
+static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir)
 {
-	swp_entry_t *ptr, entry;
+	swp_entry_t *ptr;
 	int freed = 0;
 
-	for (ptr = dir; ptr < dir + count; ptr++) {
-		if (!ptr->val)
-			continue;
-		entry = *ptr;
-		*ptr = (swp_entry_t){0};
-		freed++;
-		free_swap_and_cache(entry);
+	for (ptr = dir; ptr < edir; ptr++) {
+		if (ptr->val) {
+			free_swap_and_cache(*ptr);
+			*ptr = (swp_entry_t){0};
+			freed++;
+		}
 	}
 	return freed;
 }
@@ -225,41 +256,40 @@
  * shmem_truncate_direct - free the swap entries of a whole doubly
  *                         indirect block
  *
+ * @info:	the info structure of the inode
  * @dir:	pointer to the pointer to the block
  * @start:	offset to start from (in pages)
  * @len:	how many pages are stored in this block
- *
- * Returns the number of freed swap entries.
  */
-
-static inline unsigned long 
-shmem_truncate_direct(swp_entry_t *** dir, unsigned long start, unsigned long len) {
+static inline unsigned long
+shmem_truncate_direct(struct shmem_inode_info *info, swp_entry_t ***dir, unsigned long start, unsigned long len)
+{
 	swp_entry_t **last, **ptr;
-	unsigned long off, freed = 0;
- 
-	if (!*dir)
-		return 0;
+	unsigned long off, freed_swp, freed = 0;
 
-	last = *dir + (len + ENTRIES_PER_PAGE-1) / ENTRIES_PER_PAGE;
+	last = *dir + (len + ENTRIES_PER_PAGE - 1) / ENTRIES_PER_PAGE;
 	off = start % ENTRIES_PER_PAGE;
 
-	for (ptr = *dir + start/ENTRIES_PER_PAGE; ptr < last; ptr++) {
-		if (!*ptr) {
-			off = 0;
+	for (ptr = *dir + start/ENTRIES_PER_PAGE; ptr < last; ptr++, off = 0) {
+		if (!*ptr)
 			continue;
+
+		if (info->swapped) {
+			freed_swp = shmem_free_swp(*ptr + off,
+						*ptr + ENTRIES_PER_PAGE);
+			info->swapped -= freed_swp;
+			freed += freed_swp;
 		}
 
 		if (!off) {
-			freed += shmem_free_swp(*ptr, ENTRIES_PER_PAGE);
-			free_page ((unsigned long) *ptr);
+			freed++;
+			free_page((unsigned long) *ptr);
 			*ptr = 0;
-		} else {
-			freed += shmem_free_swp(*ptr+off,ENTRIES_PER_PAGE-off);
-			off = 0;
 		}
 	}
-	
+
 	if (!start) {
+		freed++;
 		free_page((unsigned long) *dir);
 		*dir = 0;
 	}
@@ -279,29 +309,33 @@
 shmem_truncate_indirect(struct shmem_inode_info *info, unsigned long index)
 {
 	swp_entry_t ***base;
-	unsigned long baseidx, len, start;
-	unsigned long max = info->next_index-1;
+	unsigned long baseidx, start;
+	unsigned long len = info->next_index;
+	unsigned long freed;
 
-	if (max < SHMEM_NR_DIRECT) {
+	if (len <= SHMEM_NR_DIRECT) {
 		info->next_index = index;
-		return shmem_free_swp(info->i_direct + index,
-				      SHMEM_NR_DIRECT - index);
+		if (!info->swapped)
+			return 0;
+		freed = shmem_free_swp(info->i_direct + index,
+					info->i_direct + len);
+		info->swapped -= freed;
+		return freed;
 	}
 
-	if (max < ENTRIES_PER_PAGE * ENTRIES_PER_PAGE/2 + SHMEM_NR_DIRECT) {
-		max -= SHMEM_NR_DIRECT;
+	if (len <= ENTRIES_PER_PAGEPAGE/2 + SHMEM_NR_DIRECT) {
+		len -= SHMEM_NR_DIRECT;
 		base = (swp_entry_t ***) &info->i_indirect;
 		baseidx = SHMEM_NR_DIRECT;
-		len = max+1;
 	} else {
-		max -= ENTRIES_PER_PAGE*ENTRIES_PER_PAGE/2+SHMEM_NR_DIRECT;
-		if (max >= ENTRIES_PER_PAGE*ENTRIES_PER_PAGE*ENTRIES_PER_PAGE/2)
-			BUG();
-
-		baseidx = max & ~(ENTRIES_PER_PAGE*ENTRIES_PER_PAGE-1);
-		base = (swp_entry_t ***) info->i_indirect + ENTRIES_PER_PAGE/2 + baseidx/ENTRIES_PER_PAGE/ENTRIES_PER_PAGE ;
-		len = max - baseidx + 1;
-		baseidx += ENTRIES_PER_PAGE*ENTRIES_PER_PAGE/2+SHMEM_NR_DIRECT;
+		len -= ENTRIES_PER_PAGEPAGE/2 + SHMEM_NR_DIRECT;
+		BUG_ON(len > ENTRIES_PER_PAGEPAGE*ENTRIES_PER_PAGE/2);
+		baseidx = len - 1;
+		baseidx -= baseidx % ENTRIES_PER_PAGEPAGE;
+		base = (swp_entry_t ***) info->i_indirect +
+			ENTRIES_PER_PAGE/2 + baseidx/ENTRIES_PER_PAGEPAGE;
+		len -= baseidx;
+		baseidx += ENTRIES_PER_PAGEPAGE/2 + SHMEM_NR_DIRECT;
 	}
 
 	if (index > baseidx) {
@@ -311,89 +345,106 @@
 		info->next_index = baseidx;
 		start = 0;
 	}
-	return shmem_truncate_direct(base, start, len);
+	return *base? shmem_truncate_direct(info, base, start, len): 0;
 }
 
-static void shmem_truncate (struct inode * inode)
+static void shmem_truncate(struct inode *inode)
 {
-	unsigned long index;
-	unsigned long partial;
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 	unsigned long freed = 0;
-	struct shmem_inode_info * info = SHMEM_I(inode);
+	unsigned long index;
 
-	down(&info->sem);
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
-	spin_lock (&info->lock);
 	index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	partial = inode->i_size & ~PAGE_CACHE_MASK;
+	if (index >= info->next_index)
+		return;
 
-	if (partial) {
-		swp_entry_t *entry = shmem_swp_entry(info, index-1, 0);
-		struct page *page;
-		/*
-		 * This check is racy: it's faintly possible that page
-		 * was assigned to swap during truncate_inode_pages,
-		 * and now assigned to file; but better than nothing.
-		 */
-		if (!IS_ERR(entry) && entry->val) {
-			spin_unlock(&info->lock);
-			page = shmem_getpage_locked(info, inode, index-1);
-			if (!IS_ERR(page)) {
-				memclear_highpage_flush(page, partial,
-					PAGE_CACHE_SIZE - partial);
-				UnlockPage(page);
-				page_cache_release(page);
+	spin_lock(&info->lock);
+	while (index < info->next_index)
+		freed += shmem_truncate_indirect(info, index);
+	BUG_ON(info->swapped > info->next_index);
+	spin_unlock(&info->lock);
+
+	spin_lock(&sbinfo->stat_lock);
+	sbinfo->free_blocks += freed;
+	inode->i_blocks -= freed*BLOCKS_PER_PAGE;
+	spin_unlock(&sbinfo->stat_lock);
+}
+
+static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct page *page = NULL;
+	int error;
+
+	if (attr->ia_valid & ATTR_SIZE) {
+		if (attr->ia_size < inode->i_size) {
+			/*
+			 * If truncating down to a partial page, then
+			 * if that page is already allocated, hold it
+			 * in memory until the truncation is over, so
+			 * truncate_partial_page cannot miss it were
+			 * it assigned to swap.
+			 */
+			if (attr->ia_size & (PAGE_CACHE_SIZE-1)) {
+				(void) shmem_getpage(inode,
+					attr->ia_size>>PAGE_CACHE_SHIFT,
+						&page, SGP_READ);
 			}
-			spin_lock(&info->lock);
 		}
 	}
 
-	while (index < info->next_index) 
-		freed += shmem_truncate_indirect(info, index);
-
-	info->swapped -= freed;
-	shmem_recalc_inode(inode);
-	spin_unlock (&info->lock);
-	up(&info->sem);
+	error = inode_change_ok(inode, attr);
+	if (!error)
+		error = inode_setattr(inode, attr);
+	if (page)
+		page_cache_release(page);
+	return error;
 }
 
-static void shmem_delete_inode(struct inode * inode)
+static void shmem_delete_inode(struct inode *inode)
 {
 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+	struct shmem_inode_info *info = SHMEM_I(inode);
 
 	if (inode->i_op->truncate == shmem_truncate) {
-		spin_lock (&shmem_ilock);
-		list_del (&SHMEM_I(inode)->list);
-		spin_unlock (&shmem_ilock);
+		spin_lock(&shmem_ilock);
+		list_del(&info->list);
+		spin_unlock(&shmem_ilock);
 		inode->i_size = 0;
-		shmem_truncate (inode);
+		shmem_truncate(inode);
 	}
-	spin_lock (&sbinfo->stat_lock);
+	BUG_ON(inode->i_blocks);
+	spin_lock(&sbinfo->stat_lock);
 	sbinfo->free_inodes++;
-	spin_unlock (&sbinfo->stat_lock);
+	spin_unlock(&sbinfo->stat_lock);
 	clear_inode(inode);
 }
 
-static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *ptr, swp_entry_t *eptr)
+static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
 {
-	swp_entry_t *test;
+	swp_entry_t *ptr;
 
-	for (test = ptr; test < eptr; test++) {
-		if (test->val == entry.val)
-			return test - ptr;
+	for (ptr = dir; ptr < edir; ptr++) {
+		if (ptr->val == entry.val)
+			return ptr - dir;
 	}
 	return -1;
 }
 
 static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
 {
+	struct inode *inode;
+	struct address_space *mapping;
 	swp_entry_t *ptr;
 	unsigned long idx;
+	unsigned long limit;
 	int offset;
 
 	idx = 0;
 	ptr = info->i_direct;
-	spin_lock (&info->lock);
+	spin_lock(&info->lock);
 	offset = info->next_index;
 	if (offset > SHMEM_NR_DIRECT)
 		offset = SHMEM_NR_DIRECT;
@@ -401,10 +452,10 @@
 	if (offset >= 0)
 		goto found;
 
-	for (idx = SHMEM_NR_DIRECT; idx < info->next_index; 
+	for (idx = SHMEM_NR_DIRECT; idx < info->next_index;
 	     idx += ENTRIES_PER_PAGE) {
-		ptr = shmem_swp_entry(info, idx, 0);
-		if (IS_ERR(ptr))
+		ptr = shmem_swp_entry(info, idx, NULL);
+		if (!ptr)
 			continue;
 		offset = info->next_index - idx;
 		if (offset > ENTRIES_PER_PAGE)
@@ -413,50 +464,62 @@
 		if (offset >= 0)
 			goto found;
 	}
-	spin_unlock (&info->lock);
+	spin_unlock(&info->lock);
 	return 0;
 found:
-	swap_free(entry);
-	ptr[offset] = (swp_entry_t) {0};
+	idx += offset;
+	inode = info->inode;
+	mapping = inode->i_mapping;
 	delete_from_swap_cache(page);
-	add_to_page_cache(page, info->inode->i_mapping, offset + idx);
-	SetPageDirty(page);
-	SetPageUptodate(page);
-	info->swapped--;
+
+	/* Racing against delete or truncate? Must leave out of page cache */
+	limit = (inode->i_state & I_FREEING)? 0:
+		(inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	if (idx >= limit || add_to_page_cache_unique(page,
+			mapping, idx, page_hash(mapping, idx)) == 0) {
+		ptr[offset].val = 0;
+		info->swapped--;
+	} else if (add_to_swap_cache(page, entry) != 0)
+		BUG();
 	spin_unlock(&info->lock);
+	SetPageUptodate(page);
+	/*
+	 * Decrement swap count even when the entry is left behind:
+	 * try_to_unuse will skip over mms, then reincrement count.
+	 */
+	swap_free(entry);
 	return 1;
 }
 
 /*
 * shmem_unuse() searches for a possibly swapped-out shmem page.
  */
-void shmem_unuse(swp_entry_t entry, struct page *page)
+int shmem_unuse(swp_entry_t entry, struct page *page)
 {
 	struct list_head *p;
-	struct shmem_inode_info * info;
+	struct shmem_inode_info *info;
+	int found = 0;
 
-	spin_lock (&shmem_ilock);
+	spin_lock(&shmem_ilock);
 	list_for_each(p, &shmem_inodes) {
 		info = list_entry(p, struct shmem_inode_info, list);
 
 		if (info->swapped && shmem_unuse_inode(info, entry, page)) {
 			/* move head to start search for next from here */
-			list_del(&shmem_inodes);
-			list_add_tail(&shmem_inodes, p);
+			list_move_tail(&shmem_inodes, &info->list);
+			found = 1;
 			break;
 		}
 	}
-	spin_unlock (&shmem_ilock);
+	spin_unlock(&shmem_ilock);
+	return found;
 }
 
 /*
  * Move the page from the page cache to the swap cache.
- *
- * The page lock prevents multiple occurences of shmem_writepage at
- * once.  We still need to guard against racing with
- * shmem_getpage_locked().  
  */
-static int shmem_writepage(struct page * page)
+static int shmem_writepage(struct page *page)
 {
 	struct shmem_inode_info *info;
 	swp_entry_t *entry, swap;
@@ -464,8 +527,7 @@
 	unsigned long index;
 	struct inode *inode;
 
-	if (!PageLocked(page))
-		BUG();
+	BUG_ON(!PageLocked(page));
 	if (!PageLaunder(page))
 		return fail_writepage(page);
 
@@ -473,7 +535,7 @@
 	index = page->index;
 	inode = mapping->host;
 	info = SHMEM_I(inode);
-	if (info->locked)
+	if (info->flags & VM_LOCKED)
 		return fail_writepage(page);
 getswap:
 	swap = get_swap_page();
@@ -481,12 +543,10 @@
 		return fail_writepage(page);
 
 	spin_lock(&info->lock);
-	entry = shmem_swp_entry(info, index, 0);
-	if (IS_ERR(entry))	/* this had been allocated on page allocation */
-		BUG();
-	shmem_recalc_inode(inode);
-	if (entry->val)
-		BUG();
+	BUG_ON(index >= info->next_index);
+	entry = shmem_swp_entry(info, index, NULL);
+	BUG_ON(!entry);
+	BUG_ON(entry->val);
 
 	/* Remove it from the page cache */
 	remove_inode_page(page);
@@ -514,204 +574,251 @@
 }
 
 /*
- * shmem_getpage_locked - either get the page from swap or allocate a new one
+ * shmem_getpage - either get the page from swap or allocate a new one
  *
  * If we allocate a new one we do not mark it dirty. That's up to the
  * vm. If we swap it in we mark it dirty since we also free the swap
  * entry since a page cannot live in both the swap and page cache
- *
- * Called with the inode locked, so it cannot race with itself, but we
- * still need to guard against racing with shm_writepage(), which might
- * be trying to move the page to the swap cache as we run.
  */
-static struct page * shmem_getpage_locked(struct shmem_inode_info *info, struct inode * inode, unsigned long idx)
+static int shmem_getpage(struct inode *inode, unsigned long idx, struct page **pagep, enum sgp_type sgp)
 {
-	struct address_space * mapping = inode->i_mapping;
+	struct address_space *mapping = inode->i_mapping;
+	struct shmem_inode_info *info = SHMEM_I(inode);
 	struct shmem_sb_info *sbinfo;
-	struct page * page;
+	struct page *filepage = *pagep;
+	struct page *swappage;
 	swp_entry_t *entry;
+	swp_entry_t swap;
+	int error = 0;
 
+	if (idx >= SHMEM_MAX_INDEX)
+		return -EFBIG;
+	/*
+	 * Normally, filepage is NULL on entry, and either found
+	 * uptodate immediately, or allocated and zeroed, or read
+	 * in under swappage, which is then assigned to filepage.
+	 * But shmem_readpage and shmem_prepare_write pass in a locked
+	 * filepage, which may be found not uptodate by other callers
+	 * too, and may need to be copied from the swappage read in.
+	 */
 repeat:
-	page = find_lock_page(mapping, idx);
-	if (page)
-		return page;
+	if (!filepage)
+		filepage = find_lock_page(mapping, idx);
+	if (filepage && Page_Uptodate(filepage))
+		goto done;
 
-	entry = shmem_alloc_entry (info, idx);
-	if (IS_ERR(entry))
-		return (void *)entry;
-
-	spin_lock (&info->lock);
-	
-	/* The shmem_alloc_entry() call may have blocked, and
-	 * shmem_writepage may have been moving a page between the page
-	 * cache and swap cache.  We need to recheck the page cache
-	 * under the protection of the info->lock spinlock. */
-
-	page = find_get_page(mapping, idx);
-	if (page) {
-		if (TryLockPage(page))
-			goto wait_retry;
-		spin_unlock (&info->lock);
-		return page;
-	}
-	
-	shmem_recalc_inode(inode);
-	if (entry->val) {
-		unsigned long flags;
+	spin_lock(&info->lock);
+	entry = shmem_swp_alloc(info, idx, sgp);
+	if (IS_ERR(entry)) {
+		spin_unlock(&info->lock);
+		error = PTR_ERR(entry);
+		goto failed;
+	}
+	swap = *entry;
 
+	if (swap.val) {
 		/* Look it up and read it in.. */
-		page = lookup_swap_cache(*entry);
-		if (!page) {
-			swp_entry_t swap = *entry;
-			spin_unlock (&info->lock);
-			swapin_readahead(*entry);
-			page = read_swap_cache_async(*entry);
-			if (!page) {
-				if (entry->val != swap.val)
-					goto repeat;
-				return ERR_PTR(-ENOMEM);
-			}
-			wait_on_page(page);
-			if (!Page_Uptodate(page) && entry->val == swap.val) {
-				page_cache_release(page);
-				return ERR_PTR(-EIO);
+		swappage = lookup_swap_cache(swap);
+		if (!swappage) {
+			spin_unlock(&info->lock);
+			swapin_readahead(swap);
+			swappage = read_swap_cache_async(swap);
+			if (!swappage) {
+				spin_lock(&info->lock);
+				entry = shmem_swp_alloc(info, idx, sgp);
+				if (IS_ERR(entry))
+					error = PTR_ERR(entry);
+				else if (entry->val == swap.val)
+					error = -ENOMEM;
+				spin_unlock(&info->lock);
+				if (error)
+					goto failed;
+				goto repeat;
 			}
-			
-			/* Too bad we can't trust this page, because we
-			 * dropped the info->lock spinlock */
-			page_cache_release(page);
+			wait_on_page(swappage);
+			page_cache_release(swappage);
 			goto repeat;
 		}
 
-		/* We have to this with page locked to prevent races */
-		if (TryLockPage(page)) 
-			goto wait_retry;
-
-		swap_free(*entry);
-		*entry = (swp_entry_t) {0};
-		delete_from_swap_cache(page);
-		flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced) | (1 << PG_arch_1));
-		page->flags = flags | (1 << PG_dirty);
-		add_to_page_cache_locked(page, mapping, idx);
-		info->swapped--;
-		spin_unlock (&info->lock);
+		/* We have to do this with page locked to prevent races */
+		if (TryLockPage(swappage)) {
+			spin_unlock(&info->lock);
+			wait_on_page(swappage);
+			page_cache_release(swappage);
+			goto repeat;
+		}
+		if (!Page_Uptodate(swappage)) {
+			spin_unlock(&info->lock);
+			UnlockPage(swappage);
+			page_cache_release(swappage);
+			error = -EIO;
+			goto failed;
+		}
+
+		delete_from_swap_cache(swappage);
+		if (filepage) {
+			entry->val = 0;
+			info->swapped--;
+			spin_unlock(&info->lock);
+			flush_page_to_ram(swappage);
+			copy_highpage(filepage, swappage);
+			UnlockPage(swappage);
+			page_cache_release(swappage);
+			flush_dcache_page(filepage);
+			SetPageUptodate(filepage);
+			SetPageDirty(filepage);
+			swap_free(swap);
+		} else if (add_to_page_cache_unique(swappage,
+			mapping, idx, page_hash(mapping, idx)) == 0) {
+			entry->val = 0;
+			info->swapped--;
+			spin_unlock(&info->lock);
+			filepage = swappage;
+			SetPageUptodate(filepage);
+			SetPageDirty(filepage);
+			swap_free(swap);
+		} else {
+			if (add_to_swap_cache(swappage, swap) != 0)
+				BUG();
+			spin_unlock(&info->lock);
+			SetPageUptodate(swappage);
+			SetPageDirty(swappage);
+			UnlockPage(swappage);
+			page_cache_release(swappage);
+			goto repeat;
+		}
+	} else if (sgp == SGP_READ && !filepage) {
+		filepage = find_get_page(mapping, idx);
+		if (filepage &&
+		    (!Page_Uptodate(filepage) || TryLockPage(filepage))) {
+			spin_unlock(&info->lock);
+			wait_on_page(filepage);
+			page_cache_release(filepage);
+			filepage = NULL;
+			goto repeat;
+		}
+		spin_unlock(&info->lock);
 	} else {
 		sbinfo = SHMEM_SB(inode->i_sb);
-		spin_unlock (&info->lock);
-		spin_lock (&sbinfo->stat_lock);
-		if (sbinfo->free_blocks == 0)
-			goto no_space;
+		spin_lock(&sbinfo->stat_lock);
+		if (sbinfo->free_blocks == 0) {
+			spin_unlock(&sbinfo->stat_lock);
+			spin_unlock(&info->lock);
+			error = -ENOSPC;
+			goto failed;
+		}
 		sbinfo->free_blocks--;
-		spin_unlock (&sbinfo->stat_lock);
-
-		/* Ok, get a new page.  We don't have to worry about the
-		 * info->lock spinlock here: we cannot race against
-		 * shm_writepage because we have already verified that
-		 * there is no page present either in memory or in the
-		 * swap cache, so we are guaranteed to be populating a
-		 * new shm entry.  The inode semaphore we already hold
-		 * is enough to make this atomic. */
-		page = page_cache_alloc(mapping);
-		if (!page)
-			return ERR_PTR(-ENOMEM);
-		clear_highpage(page);
-		flush_dcache_page(page);
 		inode->i_blocks += BLOCKS_PER_PAGE;
-		add_to_page_cache (page, mapping, idx);
-	}
-
-	/* We have the page */
-	SetPageUptodate(page);
-	return page;
-no_space:
-	spin_unlock (&sbinfo->stat_lock);
-	return ERR_PTR(-ENOSPC);
-
-wait_retry:
-	spin_unlock (&info->lock);
-	wait_on_page(page);
-	page_cache_release(page);
-	goto repeat;
-}
+		spin_unlock(&sbinfo->stat_lock);
 
-static int shmem_getpage(struct inode * inode, unsigned long idx, struct page **ptr)
-{
-	struct shmem_inode_info *info = SHMEM_I(inode);
-	int error;
-
-	down (&info->sem);
-	*ptr = ERR_PTR(-EFAULT);
-	if (inode->i_size <= (loff_t) idx * PAGE_CACHE_SIZE)
-		goto failed;
+		if (!filepage) {
+			spin_unlock(&info->lock);
+			filepage = page_cache_alloc(mapping);
+			if (!filepage) {
+				shmem_free_block(inode);
+				error = -ENOMEM;
+				goto failed;
+			}
 
-	*ptr = shmem_getpage_locked(info, inode, idx);
-	if (IS_ERR (*ptr))
-		goto failed;
+			spin_lock(&info->lock);
+			entry = shmem_swp_alloc(info, idx, sgp);
+			if (IS_ERR(entry))
+				error = PTR_ERR(entry);
+			if (error || entry->val ||
+			    add_to_page_cache_unique(filepage,
+			    mapping, idx, page_hash(mapping, idx)) != 0) {
+				spin_unlock(&info->lock);
+				page_cache_release(filepage);
+				shmem_free_block(inode);
+				filepage = NULL;
+				if (error)
+					goto failed;
+				goto repeat;
+			}
+		}
 
-	UnlockPage(*ptr);
-	up (&info->sem);
+		spin_unlock(&info->lock);
+		clear_highpage(filepage);
+		flush_dcache_page(filepage);
+		SetPageUptodate(filepage);
+	}
+done:
+	if (!*pagep) {
+		if (filepage) {
+			UnlockPage(filepage);
+			*pagep = filepage;
+		} else
+			*pagep = ZERO_PAGE(0);
+	}
 	return 0;
+
 failed:
-	up (&info->sem);
-	error = PTR_ERR(*ptr);
-	*ptr = NOPAGE_SIGBUS;
-	if (error == -ENOMEM)
-		*ptr = NOPAGE_OOM;
+	if (*pagep != filepage) {
+		UnlockPage(filepage);
+		page_cache_release(filepage);
+	}
 	return error;
 }
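
One behavioural detail of the new shmem_getpage() worth noting: on the
SGP_READ path a hole (no page in cache, no swap entry) comes back as
ZERO_PAGE(0) rather than a freshly allocated page, so reading a sparse
tmpfs file consumes neither memory nor blocks; pages are only
instantiated by writes (SGP_WRITE) or by faults and readpage (SGP_CACHE).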
 
-struct page * shmem_nopage(struct vm_area_struct * vma, unsigned long address, int unused)
+struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int unused)
 {
-	struct page * page;
-	unsigned int idx;
-	struct inode * inode = vma->vm_file->f_dentry->d_inode;
+	struct inode *inode = vma->vm_file->f_dentry->d_inode;
+	struct page *page = NULL;
+	unsigned long idx;
+	int error;
 
-	idx = (address - vma->vm_start) >> PAGE_CACHE_SHIFT;
+	idx = (address - vma->vm_start) >> PAGE_SHIFT;
 	idx += vma->vm_pgoff;
+	idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
 
-	if (shmem_getpage(inode, idx, &page))
-		return page;
+	error = shmem_getpage(inode, idx, &page, SGP_CACHE);
+	if (error)
+		return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS;
 
+	mark_page_accessed(page);
 	flush_page_to_ram(page);
-	return(page);
+	return page;
 }
 
-void shmem_lock(struct file * file, int lock)
+void shmem_lock(struct file *file, int lock)
 {
-	struct inode * inode = file->f_dentry->d_inode;
-	struct shmem_inode_info * info = SHMEM_I(inode);
+	struct inode *inode = file->f_dentry->d_inode;
+	struct shmem_inode_info *info = SHMEM_I(inode);
 
 	spin_lock(&info->lock);
-	info->locked = lock;
+	if (lock)
+		info->flags |= VM_LOCKED;
+	else
+		info->flags &= ~VM_LOCKED;
 	spin_unlock(&info->lock);
 }
 
-static int shmem_mmap(struct file * file, struct vm_area_struct * vma)
+static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	struct vm_operations_struct * ops;
+	struct vm_operations_struct *ops;
 	struct inode *inode = file->f_dentry->d_inode;
 
 	ops = &shmem_vm_ops;
-	if (!inode->i_sb || !S_ISREG(inode->i_mode))
+	if (!S_ISREG(inode->i_mode))
 		return -EACCES;
 	UPDATE_ATIME(inode);
 	vma->vm_ops = ops;
 	return 0;
 }
 
-struct inode *shmem_get_inode(struct super_block *sb, int mode, int dev)
+static struct inode *shmem_get_inode(struct super_block *sb, int mode, int dev)
 {
-	struct inode * inode;
+	struct inode *inode;
 	struct shmem_inode_info *info;
 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 
-	spin_lock (&sbinfo->stat_lock);
+	spin_lock(&sbinfo->stat_lock);
 	if (!sbinfo->free_inodes) {
-		spin_unlock (&sbinfo->stat_lock);
+		spin_unlock(&sbinfo->stat_lock);
 		return NULL;
 	}
 	sbinfo->free_inodes--;
-	spin_unlock (&sbinfo->stat_lock);
+	spin_unlock(&sbinfo->stat_lock);
 
 	inode = new_inode(sb);
 	if (inode) {
@@ -725,8 +832,7 @@
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		info = SHMEM_I(inode);
 		info->inode = inode;
-		spin_lock_init (&info->lock);
-		sema_init (&info->sem, 1);
+		spin_lock_init(&info->lock);
 		switch (mode & S_IFMT) {
 		default:
 			init_special_inode(inode, mode, dev);
@@ -734,9 +840,9 @@
 		case S_IFREG:
 			inode->i_op = &shmem_inode_operations;
 			inode->i_fop = &shmem_file_operations;
-			spin_lock (&shmem_ilock);
+			spin_lock(&shmem_ilock);
 			list_add_tail(&info->list, &shmem_inodes);
-			spin_unlock (&shmem_ilock);
+			spin_unlock(&shmem_ilock);
 			break;
 		case S_IFDIR:
 			inode->i_nlink++;
@@ -781,16 +887,46 @@
 static struct inode_operations shmem_symlink_inode_operations;
 static struct inode_operations shmem_symlink_inline_operations;
 
+/*
+ * tmpfs itself makes no use of generic_file_read, generic_file_mmap
+ * or generic_file_write; but shmem_readpage, shmem_prepare_write and
+ * shmem_commit_write let a tmpfs file be used below the loop driver,
+ * and shmem_readpage lets a tmpfs file be used by sendfile.
+ */
+static int
+shmem_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	int error = shmem_getpage(inode, page->index, &page, SGP_CACHE);
+	UnlockPage(page);
+	return error;
+}
+
+static int
+shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	return shmem_getpage(inode, page->index, &page, SGP_WRITE);
+}
+
+static int
+shmem_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+
+	if (pos > inode->i_size)
+		inode->i_size = pos;
+	SetPageDirty(page);
+	return 0;
+}
+
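
For context on how these three methods get used: this is the calling
sequence a 2.4 writer such as generic_file_write, or the loop driver,
follows against any address_space. A simplified sketch (kernel context
assumed, error handling omitted; not code from this patch):

	static int write_one_chunk(struct file *file,
				   struct address_space *mapping,
				   unsigned long index,
				   unsigned from, unsigned to)
	{
		struct page *page = grab_cache_page(mapping, index); /* locked */
		if (!page)
			return -ENOMEM;
		mapping->a_ops->prepare_write(file, page, from, to);
		/* ... copy the caller's bytes into the page at [from, to) ... */
		mapping->a_ops->commit_write(file, page, from, to);
		UnlockPage(page);
		page_cache_release(page);
		return 0;
	}

For tmpfs, prepare_write just ensures the page exists via
shmem_getpage(..., SGP_WRITE), and commit_write extends i_size and marks
the page dirty; nothing touches backing store until the VM decides to
swap the page out.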
 static ssize_t
-shmem_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
+shmem_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
 {
-	struct inode	*inode = file->f_dentry->d_inode; 
-	struct shmem_inode_info *info;
-	unsigned long	limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
+	struct inode	*inode = file->f_dentry->d_inode;
 	loff_t		pos;
-	struct page	*page;
 	unsigned long	written;
-	long		status;
 	int		err;
 
 	if ((ssize_t) count < 0)
@@ -802,119 +938,69 @@
 	down(&inode->i_sem);
 
 	pos = *ppos;
-	err = -EINVAL;
-	if (pos < 0)
-		goto out;
-
-	err = file->f_error;
-	if (err) {
-		file->f_error = 0;
-		goto out;
-	}
-
 	written = 0;
 
-	if (file->f_flags & O_APPEND)
-		pos = inode->i_size;
-
-	/*
-	 * Check whether we've reached the file size limit.
-	 */
-	err = -EFBIG;
-	if (limit != RLIM_INFINITY) {
-		if (pos >= limit) {
-			send_sig(SIGXFSZ, current, 0);
-			goto out;
-		}
-		if (count > limit - pos) {
-			send_sig(SIGXFSZ, current, 0);
-			count = limit - pos;
-		}
-	}
+	err = precheck_file_write(file, inode, &count, &pos);
+	if (err || !count)
+		goto out;
 
-	status	= 0;
-	if (count) {
-		remove_suid(inode);
-		inode->i_ctime = inode->i_mtime = CURRENT_TIME;
-	}
+	remove_suid(inode);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 
-	while (count) {
+	do {
+		struct page *page = NULL;
 		unsigned long bytes, index, offset;
 		char *kaddr;
+		int left;
 
-		/*
-		 * Try to find the page in the cache. If it isn't there,
-		 * allocate a free page.
-		 */
 		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
 		index = pos >> PAGE_CACHE_SHIFT;
 		bytes = PAGE_CACHE_SIZE - offset;
-		if (bytes > count) {
+		if (bytes > count)
 			bytes = count;
-		}
 
 		/*
-		 * Bring in the user page that we will copy from _first_.
-		 * Otherwise there's a nasty deadlock on copying from the
-		 * same page as we're writing to, without it being marked
-		 * up-to-date.
+		 * We don't hold page lock across copy from user -
+		 * what would it guard against? - so no deadlock here.
 		 */
-		{ volatile unsigned char dummy;
-			__get_user(dummy, buf);
-			__get_user(dummy, buf+bytes-1);
-		}
 
-		info = SHMEM_I(inode);
-		down (&info->sem);
-		page = shmem_getpage_locked(info, inode, index);
-		up (&info->sem);
-
-		status = PTR_ERR(page);
-		if (IS_ERR(page))
+		err = shmem_getpage(inode, index, &page, SGP_WRITE);
+		if (err)
 			break;
 
-		/* We have exclusive IO access to the page.. */
-		if (!PageLocked(page)) {
-			PAGE_BUG(page);
-		}
-
 		kaddr = kmap(page);
-		status = copy_from_user(kaddr+offset, buf, bytes);
+		left = __copy_from_user(kaddr + offset, buf, bytes);
 		kunmap(page);
-		if (status)
-			goto fail_write;
+
+		written += bytes;
+		count -= bytes;
+		pos += bytes;
+		buf += bytes;
+		if (pos > inode->i_size)
+			inode->i_size = pos;
 
 		flush_dcache_page(page);
-		if (bytes > 0) {
-			SetPageDirty(page);
-			written += bytes;
-			count -= bytes;
-			pos += bytes;
-			buf += bytes;
-			if (pos > inode->i_size) 
-				inode->i_size = pos;
-		}
-unlock:
-		/* Mark it unlocked again and drop the page.. */
-		UnlockPage(page);
+		SetPageDirty(page);
+		SetPageReferenced(page);
 		page_cache_release(page);
 
-		if (status < 0)
+		if (left) {
+			pos -= left;
+			written -= left;
+			err = -EFAULT;
 			break;
-	}
-	*ppos = pos;
+		}
+	} while (count);
 
-	err = written ? written : status;
+	*ppos = pos;
+	if (written)
+		err = written;
 out:
 	up(&inode->i_sem);
 	return err;
-fail_write:
-	status = -EFAULT;
-	ClearPageUptodate(page);
-	goto unlock;
 }
 
-static void do_shmem_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc)
+static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct address_space *mapping = inode->i_mapping;
@@ -924,26 +1010,55 @@
 	offset = *ppos & ~PAGE_CACHE_MASK;
 
 	for (;;) {
-		struct page *page;
+		struct page *page = NULL;
 		unsigned long end_index, nr, ret;
 
 		end_index = inode->i_size >> PAGE_CACHE_SHIFT;
 		if (index > end_index)
 			break;
-		nr = PAGE_CACHE_SIZE;
 		if (index == end_index) {
 			nr = inode->i_size & ~PAGE_CACHE_MASK;
 			if (nr <= offset)
 				break;
 		}
 
-		nr = nr - offset;
-
-		if ((desc->error = shmem_getpage(inode, index, &page)))
+		desc->error = shmem_getpage(inode, index, &page, SGP_READ);
+		if (desc->error) {
+			if (desc->error == -EINVAL)
+				desc->error = 0;
 			break;
+		}
 
-		if (mapping->i_mmap_shared != NULL)
-			flush_dcache_page(page);
+		/*
+		 * We must evaluate after, since reads (unlike writes)
+		 * are called without i_sem protection against truncate
+		 */
+		nr = PAGE_CACHE_SIZE;
+		end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+		if (index == end_index) {
+			nr = inode->i_size & ~PAGE_CACHE_MASK;
+			if (nr <= offset) {
+				page_cache_release(page);
+				break;
+			}
+		}
+		nr -= offset;
+
+		if (page != ZERO_PAGE(0)) {
+			/*
+			 * If users can be writing to this page using arbitrary
+			 * virtual addresses, take care about potential aliasing
+			 * before reading the page on the kernel side.
+			 */
+			if (mapping->i_mmap_shared != NULL)
+				flush_dcache_page(page);
+			/*
+			 * Mark the page accessed if we read the
+			 * beginning or we just did an lseek.
+			 */
+			if (!offset || !filp->f_reada)
+				mark_page_accessed(page);
+		}
 
 		/*
 		 * Ok, we have the page, and it's up-to-date, so
@@ -959,39 +1074,37 @@
 		offset += ret;
 		index += offset >> PAGE_CACHE_SHIFT;
 		offset &= ~PAGE_CACHE_MASK;
-	
+
 		page_cache_release(page);
 		if (ret != nr || !desc->count)
 			break;
 	}
 
 	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
+	filp->f_reada = 1;
 	UPDATE_ATIME(inode);
 }
 
-static ssize_t shmem_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
+static ssize_t shmem_file_read(struct file *filp, char *buf, size_t count, loff_t *ppos)
 {
-	ssize_t retval;
-
-	retval = -EFAULT;
-	if (access_ok(VERIFY_WRITE, buf, count)) {
-		retval = 0;
-
-		if (count) {
-			read_descriptor_t desc;
+	read_descriptor_t desc;
 
-			desc.written = 0;
-			desc.count = count;
-			desc.buf = buf;
-			desc.error = 0;
-			do_shmem_file_read(filp, ppos, &desc);
+	if ((ssize_t) count < 0)
+		return -EINVAL;
+	if (!access_ok(VERIFY_WRITE, buf, count))
+		return -EFAULT;
+	if (!count)
+		return 0;
 
-			retval = desc.written;
-			if (!retval)
-				retval = desc.error;
-		}
-	}
-	return retval;
+	desc.written = 0;
+	desc.count = count;
+	desc.buf = buf;
+	desc.error = 0;
+
+	do_shmem_file_read(filp, ppos, &desc);
+	if (desc.written)
+		return desc.written;
+	return desc.error;
 }
 
 static int shmem_statfs(struct super_block *sb, struct statfs *buf)
@@ -1000,12 +1113,12 @@
 
 	buf->f_type = TMPFS_MAGIC;
 	buf->f_bsize = PAGE_CACHE_SIZE;
-	spin_lock (&sbinfo->stat_lock);
+	spin_lock(&sbinfo->stat_lock);
 	buf->f_blocks = sbinfo->max_blocks;
 	buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
 	buf->f_files = sbinfo->max_inodes;
 	buf->f_ffree = sbinfo->free_inodes;
-	spin_unlock (&sbinfo->stat_lock);
+	spin_unlock(&sbinfo->stat_lock);
 	buf->f_namelen = NAME_MAX;
 	return 0;
 }
@@ -1014,7 +1127,7 @@
  * Lookup the data. This is trivial - if the dentry didn't already
  * exist, we know it is negative.
  */
-static struct dentry * shmem_lookup(struct inode *dir, struct dentry *dentry)
+static struct dentry *shmem_lookup(struct inode *dir, struct dentry *dentry)
 {
 	d_add(dentry, NULL);
 	return NULL;
@@ -1025,7 +1138,7 @@
  */
 static int shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, int dev)
 {
-	struct inode * inode = shmem_get_inode(dir->i_sb, mode, dev);
+	struct inode *inode = shmem_get_inode(dir->i_sb, mode, dev);
 	int error = -ENOSPC;
 
 	if (inode) {
@@ -1038,7 +1151,7 @@
 	return error;
 }
 
-static int shmem_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
 	int error;
 
@@ -1056,7 +1169,7 @@
 /*
  * Link a file..
  */
-static int shmem_link(struct dentry *old_dentry, struct inode * dir, struct dentry * dentry)
+static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = old_dentry->d_inode;
 
@@ -1105,7 +1218,7 @@
 	return 1;
 }
 
-static int shmem_unlink(struct inode * dir, struct dentry *dentry)
+static int shmem_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
 
@@ -1116,7 +1229,7 @@
 	return 0;
 }
 
-static int shmem_rmdir(struct inode * dir, struct dentry *dentry)
+static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	if (!shmem_empty(dentry))
 		return -ENOTEMPTY;
@@ -1131,12 +1244,12 @@
 * it exists so that the VFS layer correctly frees it when it
  * gets overwritten.
  */
-static int shmem_rename(struct inode * old_dir, struct dentry *old_dentry, struct inode * new_dir,struct dentry *new_dentry)
+static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
 {
 	struct inode *inode = old_dentry->d_inode;
 	int they_are_dirs = S_ISDIR(inode->i_mode);
 
-	if (!shmem_empty(new_dentry)) 
+	if (!shmem_empty(new_dentry))
 		return -ENOTEMPTY;
 
 	if (new_dentry->d_inode) {
@@ -1156,13 +1269,14 @@
 	return 0;
 }
 
-static int shmem_symlink(struct inode * dir, struct dentry *dentry, const char * symname)
+static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 {
+	int error;
 	int len;
 	struct inode *inode;
-	struct page *page;
+	struct page *page = NULL;
 	char *kaddr;
-	struct shmem_inode_info * info;
+	struct shmem_inode_info *info;
 
 	len = strlen(symname) + 1;
 	if (len > PAGE_CACHE_SIZE)
@@ -1179,24 +1293,20 @@
 		memcpy(info, symname, len);
 		inode->i_op = &shmem_symlink_inline_operations;
 	} else {
-		down(&info->sem);
-		page = shmem_getpage_locked(info, inode, 0);
-		if (IS_ERR(page)) {
-			up(&info->sem);
+		error = shmem_getpage(inode, 0, &page, SGP_WRITE);
+		if (error) {
 			iput(inode);
-			return PTR_ERR(page);
+			return error;
 		}
 		inode->i_op = &shmem_symlink_inode_operations;
-		spin_lock (&shmem_ilock);
+		spin_lock(&shmem_ilock);
 		list_add_tail(&info->list, &shmem_inodes);
-		spin_unlock (&shmem_ilock);
+		spin_unlock(&shmem_ilock);
 		kaddr = kmap(page);
 		memcpy(kaddr, symname, len);
 		kunmap(page);
 		SetPageDirty(page);
-		UnlockPage(page);
 		page_cache_release(page);
-		up(&info->sem);
 	}
 	dir->i_size += BOGO_DIRENT_SIZE;
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
@@ -1207,7 +1317,7 @@
 
 static int shmem_readlink_inline(struct dentry *dentry, char *buffer, int buflen)
 {
-	return vfs_readlink(dentry,buffer,buflen, (const char *)SHMEM_I(dentry->d_inode));
+	return vfs_readlink(dentry, buffer, buflen, (const char *)SHMEM_I(dentry->d_inode));
 }
 
 static int shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
@@ -1217,27 +1327,26 @@
 
 static int shmem_readlink(struct dentry *dentry, char *buffer, int buflen)
 {
-	struct page * page;
-	int res = shmem_getpage(dentry->d_inode, 0, &page);
-
+	struct page *page = NULL;
+	int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ);
 	if (res)
 		return res;
-
-	res = vfs_readlink(dentry,buffer,buflen, kmap(page));
+	res = vfs_readlink(dentry, buffer, buflen, kmap(page));
 	kunmap(page);
+	mark_page_accessed(page);
 	page_cache_release(page);
 	return res;
 }
 
 static int shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
-	struct page * page;
-	int res = shmem_getpage(dentry->d_inode, 0, &page);
+	struct page *page = NULL;
+	int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ);
 	if (res)
 		return res;
-
 	res = vfs_follow_link(nd, kmap(page));
 	kunmap(page);
+	mark_page_accessed(page);
 	page_cache_release(page);
 	return res;
 }
@@ -1253,19 +1362,18 @@
 	follow_link:	shmem_follow_link,
 };
 
-static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long * blocks, unsigned long *inodes)
+static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes)
 {
 	char *this_char, *value, *rest;
 
-	this_char = NULL;
-	if ( options )
-		this_char = strtok(options,",");
-	for ( ; this_char; this_char = strtok(NULL,",")) {
+	while ((this_char = strsep(&options, ",")) != NULL) {
+		if (!*this_char)
+			continue;
 		if ((value = strchr(this_char,'=')) != NULL) {
 			*value++ = 0;
 		} else {
-			printk(KERN_ERR 
-			    "tmpfs: No value for mount option '%s'\n", 
+			printk(KERN_ERR
+			    "tmpfs: No value for mount option '%s'\n",
 			    this_char);
 			return 1;
 		}
@@ -1273,6 +1381,14 @@
 		if (!strcmp(this_char,"size")) {
 			unsigned long long size;
 			size = memparse(value,&rest);
+			if (*rest == '%') {
+				struct sysinfo si;
+				si_meminfo(&si);
+				size <<= PAGE_SHIFT;
+				size *= si.totalram;
+				do_div(size, 100);
+				rest++;
+			}
 			if (*rest)
 				goto bad_val;
 			*blocks = size >> PAGE_CACHE_SHIFT;
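
The '%' suffix accepted above sizes the filesystem relative to physical
memory at mount (and remount) time: memparse() stops at the '%',
si_meminfo() supplies totalram in pages, and do_div() (the kernel's
in-place 64-bit divide) scales the product down by 100. So, for example,
"mount -t tmpfs -o size=50% tmpfs /dev/shm" caps that instance at half
of RAM, whatever the machine's memory size.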
@@ -1311,33 +1427,32 @@
 	return 0;
 
 bad_val:
-	printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", 
+	printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
 	       value, this_char);
 	return 1;
-
 }
 
-static int shmem_remount_fs (struct super_block *sb, int *flags, char *data)
+static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
 {
-	struct shmem_sb_info *sbinfo = &sb->u.shmem_sb;
+	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 	unsigned long max_blocks = sbinfo->max_blocks;
 	unsigned long max_inodes = sbinfo->max_inodes;
 
-	if (shmem_parse_options (data, NULL, NULL, NULL, &max_blocks, &max_inodes))
+	if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks, &max_inodes))
 		return -EINVAL;
 	return shmem_set_size(sbinfo, max_blocks, max_inodes);
 }
 
-int shmem_sync_file(struct file * file, struct dentry *dentry, int datasync)
+static int shmem_sync_file(struct file *file, struct dentry *dentry, int datasync)
 {
 	return 0;
 }
 #endif
 
-static struct super_block *shmem_read_super(struct super_block * sb, void * data, int silent)
+static struct super_block *shmem_read_super(struct super_block *sb, void *data, int silent)
 {
-	struct inode * inode;
-	struct dentry * root;
+	struct inode *inode;
+	struct dentry *root;
 	unsigned long blocks, inodes;
 	int mode   = S_IRWXUGO | S_ISVTX;
 	uid_t uid = current->fsuid;
@@ -1353,11 +1468,11 @@
 	blocks = inodes = si.totalram / 2;
 
 #ifdef CONFIG_TMPFS
-	if (shmem_parse_options (data, &mode, &uid, &gid, &blocks, &inodes))
+	if (shmem_parse_options(data, &mode, &uid, &gid, &blocks, &inodes))
 		return NULL;
 #endif
 
-	spin_lock_init (&sbinfo->stat_lock);
+	spin_lock_init(&sbinfo->stat_lock);
 	sbinfo->max_blocks = blocks;
 	sbinfo->free_blocks = blocks;
 	sbinfo->max_inodes = inodes;
@@ -1382,23 +1497,28 @@
 	return sb;
 }
 
-
-
 static struct address_space_operations shmem_aops = {
+	removepage:	shmem_removepage,
 	writepage:	shmem_writepage,
+#ifdef CONFIG_TMPFS
+	readpage:	shmem_readpage,
+	prepare_write:	shmem_prepare_write,
+	commit_write:	shmem_commit_write,
+#endif
 };
 
 static struct file_operations shmem_file_operations = {
-	mmap:	shmem_mmap,
+	mmap:		shmem_mmap,
 #ifdef CONFIG_TMPFS
-	read:	shmem_file_read,
-	write:	shmem_file_write,
-	fsync:	shmem_sync_file,
+	read:		shmem_file_read,
+	write:		shmem_file_write,
+	fsync:		shmem_sync_file,
 #endif
 };
 
 static struct inode_operations shmem_inode_operations = {
 	truncate:	shmem_truncate,
+	setattr:	shmem_notify_change,
 };
 
 static struct inode_operations shmem_dir_inode_operations = {
@@ -1421,11 +1541,11 @@
 	remount_fs:	shmem_remount_fs,
 #endif
 	delete_inode:	shmem_delete_inode,
-	put_inode:	force_delete,	
+	put_inode:	force_delete,
 };
 
 static struct vm_operations_struct shmem_vm_ops = {
-	nopage:	shmem_nopage,
+	nopage:		shmem_nopage,
 };
 
 #ifdef CONFIG_TMPFS
@@ -1437,65 +1557,65 @@
 #endif
 static struct vfsmount *shm_mnt;
 
-static int __init init_shmem_fs(void)
+static int __init init_tmpfs(void)
 {
 	int error;
-	struct vfsmount * res;
 
-	if ((error = register_filesystem(&tmpfs_fs_type))) {
-		printk (KERN_ERR "Could not register tmpfs\n");
-		return error;
+	error = register_filesystem(&tmpfs_fs_type);
+	if (error) {
+		printk(KERN_ERR "Could not register tmpfs\n");
+		goto out3;
 	}
 #ifdef CONFIG_TMPFS
-	if ((error = register_filesystem(&shmem_fs_type))) {
-		printk (KERN_ERR "Could not register shm fs\n");
-		return error;
+	error = register_filesystem(&shmem_fs_type);
+	if (error) {
+		printk(KERN_ERR "Could not register shm fs\n");
+		goto out2;
 	}
-	devfs_mk_dir (NULL, "shm", NULL);
+	devfs_mk_dir(NULL, "shm", NULL);
 #endif
-	res = kern_mount(&tmpfs_fs_type);
-	if (IS_ERR (res)) {
-		printk (KERN_ERR "could not kern_mount tmpfs\n");
-		unregister_filesystem(&tmpfs_fs_type);
-		return PTR_ERR(res);
+	shm_mnt = kern_mount(&tmpfs_fs_type);
+	if (IS_ERR(shm_mnt)) {
+		error = PTR_ERR(shm_mnt);
+		printk(KERN_ERR "Could not kern_mount tmpfs\n");
+		goto out1;
 	}
-	shm_mnt = res;
 
 	/* The internal instance should not do size checking */
-	if ((error = shmem_set_size(SHMEM_SB(res->mnt_sb), ULONG_MAX, ULONG_MAX)))
-		printk (KERN_ERR "could not set limits on internal tmpfs\n");
-
+	shmem_set_size(SHMEM_SB(shm_mnt->mnt_sb), ULONG_MAX, ULONG_MAX);
 	return 0;
-}
 
-static void __exit exit_shmem_fs(void)
-{
+out1:
 #ifdef CONFIG_TMPFS
 	unregister_filesystem(&shmem_fs_type);
+out2:
 #endif
 	unregister_filesystem(&tmpfs_fs_type);
-	mntput(shm_mnt);
+out3:
+	shm_mnt = ERR_PTR(error);
+	return error;
 }
-
-module_init(init_shmem_fs)
-module_exit(exit_shmem_fs)
+module_init(init_tmpfs)
 
 /*
- * shmem_file_setup - get an unlinked file living in shmem fs
+ * shmem_file_setup - get an unlinked file living in tmpfs
  *
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
  * @size: size to be set for the file
  *
  */
-struct file *shmem_file_setup(char * name, loff_t size)
+struct file *shmem_file_setup(char *name, loff_t size)
 {
 	int error;
 	struct file *file;
-	struct inode * inode;
+	struct inode *inode;
 	struct dentry *dentry, *root;
 	struct qstr this;
 	int vm_enough_memory(long pages);
 
+	if (IS_ERR(shm_mnt))
+		return (void *)shm_mnt;
+
 	if (size > SHMEM_MAX_BYTES)
 		return ERR_PTR(-EINVAL);
 
@@ -1517,7 +1637,7 @@
 
 	error = -ENOSPC;
 	inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
-	if (!inode) 
+	if (!inode)
 		goto close_file;
 
 	d_instantiate(dentry, inode);
@@ -1527,13 +1647,13 @@
 	file->f_dentry = dentry;
 	file->f_op = &shmem_file_operations;
 	file->f_mode = FMODE_WRITE | FMODE_READ;
-	return(file);
+	return file;
 
 close_file:
 	put_filp(file);
 put_dentry:
-	dput (dentry);
-	return ERR_PTR(error);	
+	dput(dentry);
+	return ERR_PTR(error);
 }
 
 /*
@@ -1545,13 +1665,13 @@
 {
 	struct file *file;
 	loff_t size = vma->vm_end - vma->vm_start;
-	
+
 	file = shmem_file_setup("dev/zero", size);
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
 	if (vma->vm_file)
-		fput (vma->vm_file);
+		fput(vma->vm_file);
 	vma->vm_file = file;
 	vma->vm_ops = &shmem_vm_ops;
 	return 0;
