The page cache holds all buffered file I/O data; direct I/O bypasses the page cache.
The page cache helps Linux economize I/O:
– Read requests can be served faster by reading ahead, with the read-ahead quantity based on the
historical pattern of the application's file system accesses
– Write requests are delayed, so data in the page cache can receive multiple updates before
being written to disk
– Write requests in the page cache can be merged into larger I/O requests
But the page cache...
– Requires Linux memory pages
– Is not useful when cached data is not reused
Data is needed only once
The application buffers the data itself
– Linux does not know which data the application really needs next; it can only guess
– There is no alternative if the application cannot handle direct I/O
Consider using...
direct I/O:
– bypasses the page cache
– is a good choice wherever the application does not want Linux to economize I/O and/or buffers
larger amounts of file content itself
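A minimal sketch of such a bypass; the path /data/blob and the 4 KiB size are illustrative, and O_DIRECT requires the buffer address, file offset, and transfer size to be aligned to the device's logical block size:

#define _GNU_SOURCE              /* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
    const size_t len = 4096;     /* one logical block, alignment-safe */
    void *buf;
    ssize_t n;
    int fd;

    if (posix_memalign(&buf, 4096, len))   /* aligned buffer is mandatory */
        return 1;

    fd = open("/data/blob", O_RDONLY | O_DIRECT); /* bypass the page cache */
    if (fd < 0) {
        perror("open");
        return 1;
    }

    n = read(fd, buf, len);      /* data is transferred into buf directly */
    printf("read %zd bytes\n", n);

    close(fd);
    free(buf);
    return 0;
}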
async I/O:
– prevents the application from being blocked in the I/O system call until the I/O completes
– still allows Linux to merge reads when the page cache is used
– can be combined with direct I/O
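A hedged sketch of an asynchronous read using POSIX AIO (<aio.h>, link with -lrt); the file name is illustrative, and aio_suspend() is just one way to wait for completion:

#include <aio.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    static char buf[4096];
    struct aiocb cb;
    const struct aiocb *list[1] = { &cb };
    int fd = open("/data/blob", O_RDONLY);

    if (fd < 0) {
        perror("open");
        return 1;
    }

    memset(&cb, 0, sizeof(cb));
    cb.aio_fildes = fd;
    cb.aio_buf    = buf;
    cb.aio_nbytes = sizeof(buf);
    cb.aio_offset = 0;

    if (aio_read(&cb) < 0) {     /* enqueue the read, do not block */
        perror("aio_read");
        return 1;
    }

    /* ... the application can do other work here ... */

    while (aio_error(&cb) == EINPROGRESS)
        aio_suspend(list, 1, NULL);   /* sleep until completion */

    printf("read %zd bytes\n", aio_return(&cb));
    close(fd);
    return 0;
}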
temporary files:
– should not reside on real disks; a RAM disk or tmpfs allows the fastest access to these files
– they do not need to survive a crash, so do not place them on a journaling file system
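A small sketch of creating a scratch file on tmpfs, assuming /dev/shm is a tmpfs mount (the default on most distributions); the name pattern is illustrative:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
    char path[] = "/dev/shm/scratch-XXXXXX";
    int fd = mkstemp(path);      /* unique file backed by memory */

    if (fd < 0) {
        perror("mkstemp");
        return 1;
    }
    unlink(path);                /* anonymous from now on; freed on close */

    if (write(fd, "temp data\n", 10) != 10)
        perror("write");

    close(fd);
    return 0;
}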
file system:
– use ext3 and select the appropriate journaling mode (journal, ordered, writeback)
– turning off atime is only suitable if no application makes decisions based on the "last read"
time; consider relatime instead
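An illustrative way to apply both hints from C via mount(2); the device /dev/sdb1 and mount point /data are hypothetical, and the same options are more commonly set in /etc/fstab:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
    /* MS_RELATIME updates atime only when it is older than mtime/ctime;
     * the ext3 journaling mode is passed as an option string (requires
     * root and an existing, formatted device). */
    if (mount("/dev/sdb1", "/data", "ext3",
              MS_RELATIME, "data=writeback") < 0) {
        perror("mount");
        return 1;
    }
    return 0;
}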
Direct I/O versus Page cache
Direct I/O
– Preferable if the application caches data itself
Application knows best which data is needed again
Application knows which data is most likely needed next
Example: database management systems (DBMS)
– Preferable if caching makes no sense
Data only needed once
Backup and restore
Page cache
– Optimizes re-reads and writes, but can be critical
Data written to the page cache but not yet to disk is lost on a crash, which matters if the
data loss cannot easily be handled
– If the application cannot handle direct I/O
A typical example is a file server
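Even without direct I/O, an application that reads data only once (the backup/restore case above) can at least tell the kernel not to keep the pages. A sketch using posix_fadvise(); the file name is illustrative:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    char buf[1 << 16];
    ssize_t n;
    int fd = open("/backup/archive", O_RDONLY);

    if (fd < 0) {
        perror("open");
        return 1;
    }
    /* declare sequential access so read-ahead can be aggressive */
    posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);

    while ((n = read(fd, buf, sizeof(buf))) > 0)
        ;                        /* process the data exactly once */

    /* drop the now-useless pages from the page cache */
    posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
    close(fd);
    return 0;
}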
/*
* Each physical page in the system has a struct page associated with
* it to keep track of whatever it is we are using the page for at the
* moment. Note that we have no way to track which tasks are using
* a page, though if it is a pagecache page, rmap structures can tell us
* who is mapping it.
*
* The objects in struct page are organized in double word blocks in
* order to allow us to use atomic double word operations on portions
* of struct page. That is currently only used by slub but the arrangement
* allows the use of atomic double word operations on the flags/mapping
* and lru list pointers also.
*/
struct page {
/* First double word block */
unsigned long flags; /* Atomic flags, some possibly
* updated asynchronously */
struct address_space *mapping; /* If low bit clear, points to
* inode address_space, or NULL.
* If page mapped as anonymous
* memory, low bit is set, and
* it points to anon_vma object:
* see PAGE_MAPPING_ANON below.
*/
/* Second double word */
struct {
union {
pgoff_t index; /* Our offset within mapping. */
void *freelist; /* slub/slob first free object */
bool pfmemalloc; /* If set by the page allocator,
* ALLOC_NO_WATERMARKS was set
* and the low watermark was not
* met implying that the system
* is under some pressure. The
* caller should try to ensure
* this page is only used to
* free other pages.
*/
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
pgtable_t pmd_huge_pte; /* protected by page->ptl */
#endif
};
union {
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
/* Used for cmpxchg_double in slub */
unsigned long counters;
#else
/*
* Keep _count separate from slub cmpxchg_double data.
* As the rest of the double word is protected by
* slab_lock but _count is not.
*/
unsigned counters;
#endif
struct {
union {
/*
* Count of ptes mapped in
* mms, to show when page is
* mapped & limit reverse map
* searches.
*
* Used also for tail pages
* refcounting instead of
* _count. Tail pages cannot
* be mapped and keeping the
* tail page _count zero at
* all times guarantees
* get_page_unless_zero() will
* never succeed on tail
* pages.
*/
atomic_t _mapcount;
struct { /* SLUB */
unsigned inuse:16;
unsigned objects:15;
unsigned frozen:1;
};
int units; /* SLOB */
};
atomic_t _count; /* Usage count, see below. */
};
};
};
/* Third double word block */
union {
struct list_head lru; /* Pageout list, eg. active_list
* protected by zone->lru_lock !
*/
struct { /* slub per cpu partial pages */
struct page *next; /* Next partial slab */
#ifdef CONFIG_64BIT
int pages; /* Nr of partial slabs left */
int pobjects; /* Approximate # of objects */
#else
short int pages;
short int pobjects;
#endif
};
struct list_head list; /* slobs list of pages */
struct slab *slab_page; /* slab fields */
};
/* Remainder is not double word aligned */
union {
unsigned long private; /* Mapping-private opaque data:
* usually used for buffer_heads
* if PagePrivate set; used for
* swp_entry_t if PageSwapCache;
* indicates order in the buddy
* system if PG_buddy is set.
*/
#if USE_SPLIT_PTE_PTLOCKS
#if BLOATED_SPINLOCKS
spinlock_t *ptl;
#else
spinlock_t ptl;
#endif
#endif
struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */
struct page *first_page; /* Compound tail pages */
};
/*
* On machines where all RAM is mapped into kernel address space,
* we can simply calculate the virtual address. On machines with
* highmem some memory is mapped into kernel virtual memory
* dynamically, so we need a place to store that address.
* Note that this field could be 16 bits on x86 ... ;)
*
* Architectures with slow multiplication can define
* WANT_PAGE_VIRTUAL in asm/page.h
*/
#if defined(WANT_PAGE_VIRTUAL)
void *virtual; /* Kernel virtual address (NULL if
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
unsigned long debug_flags; /* Use atomic bitops on this */
#endif
#ifdef CONFIG_KMEMCHECK
/*
* kmemcheck wants to track the status of each byte in a page; this
* is a pointer to such a status block. NULL if not tracked.
*/
void *shadow;
#endif
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
int _last_cpupid;
#endif
};
struct address_space {
struct inode *host; /* owner: inode, block_device */
struct radix_tree_root page_tree; /* radix tree of all pages */
spinlock_t tree_lock; /* and lock protecting it */
unsigned int i_mmap_writable; /* count VM_SHARED mappings */
struct rb_root i_mmap; /* tree of private and shared mappings */
struct list_head i_mmap_nonlinear; /* list VM_NONLINEAR mappings */
struct mutex i_mmap_mutex; /* protect tree, count, list */
/* Protected by tree_lock together with the radix tree */
unsigned long nrpages; /* number of total pages */
unsigned long nrshadows; /* number of shadow entries */
pgoff_t writeback_index; /* writeback starts here */
const struct address_space_operations *a_ops; /* methods */
unsigned long flags; /* error bits/gfp mask */
struct backing_dev_info *backing_dev_info; /* device readahead, etc */
spinlock_t private_lock; /* for use by the address_space */
struct list_head private_list; /* ditto */
void *private_data; /* ditto */
} __attribute__((aligned(sizeof(long))));