The page cache holds all buffered file I/O data; direct I/O bypasses the page cache.
The page cache helps Linux economize I/O:
– Read requests can be served faster by reading ahead, with the read-ahead quantity based on the
historical pattern of the application's file system accesses
– Write requests are delayed, so data in the page cache can receive multiple updates before
being written to disk
– Write requests in the page cache can be merged into larger I/O requests
But the page cache...
– Requires Linux memory pages
– Is not useful when cached data is not reused
Data is needed only once
The application buffers the data itself
– Linux does not know which data the application really needs next; it can only guess
– There is no alternative if the application cannot handle direct I/O
Consider using...
direct I/O:
– bypasses the page cache
– is a good choice wherever the application does not want Linux to economize I/O and/or buffers
larger amounts of file content itself
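A minimal sketch of such a bypass; the path /data/blob and the 4 KiB size are illustrative, and O_DIRECT requires the buffer address, file offset, and transfer size to be aligned to the device's logical block size:

#define _GNU_SOURCE              /* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
    const size_t len = 4096;     /* one logical block, alignment-safe */
    void *buf;
    ssize_t n;
    int fd;

    if (posix_memalign(&buf, 4096, len))   /* aligned buffer is mandatory */
        return 1;

    fd = open("/data/blob", O_RDONLY | O_DIRECT); /* bypass the page cache */
    if (fd < 0) {
        perror("open");
        return 1;
    }

    n = read(fd, buf, len);      /* data is transferred into buf directly */
    printf("read %zd bytes\n", n);

    close(fd);
    free(buf);
    return 0;
}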
async I/O:
– prevents the application from being blocked in the I/O system call until the I/O completes
– still allows Linux to merge reads when the page cache is used
– can be combined with direct I/O
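A hedged sketch of an asynchronous read using POSIX AIO (<aio.h>, link with -lrt); the file name is illustrative, and aio_suspend() is just one way to wait for completion:

#include <aio.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    static char buf[4096];
    struct aiocb cb;
    const struct aiocb *list[1] = { &cb };
    int fd = open("/data/blob", O_RDONLY);

    if (fd < 0) {
        perror("open");
        return 1;
    }

    memset(&cb, 0, sizeof(cb));
    cb.aio_fildes = fd;
    cb.aio_buf    = buf;
    cb.aio_nbytes = sizeof(buf);
    cb.aio_offset = 0;

    if (aio_read(&cb) < 0) {     /* enqueue the read, do not block */
        perror("aio_read");
        return 1;
    }

    /* ... the application can do other work here ... */

    while (aio_error(&cb) == EINPROGRESS)
        aio_suspend(list, 1, NULL);   /* sleep until completion */

    printf("read %zd bytes\n", aio_return(&cb));
    close(fd);
    return 0;
}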
temporary files:
– should not reside on real disks; a RAM disk or tmpfs allows the fastest access to these files
– they do not need to survive a crash, so do not place them on a journaling file system
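A small sketch of creating a scratch file on tmpfs, assuming /dev/shm is a tmpfs mount (the default on most distributions); the name pattern is illustrative:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
    char path[] = "/dev/shm/scratch-XXXXXX";
    int fd = mkstemp(path);      /* unique file backed by memory */

    if (fd < 0) {
        perror("mkstemp");
        return 1;
    }
    unlink(path);                /* anonymous from now on; freed on close */

    if (write(fd, "temp data\n", 10) != 10)
        perror("write");

    close(fd);
    return 0;
}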
file system:
– use ext3 and select the appropriate journaling mode (journal, ordered, writeback)
– turning off atime is only suitable if no application makes decisions based on the "last read"
time; consider relatime instead
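An illustrative way to apply both hints from C via mount(2); the device /dev/sdb1 and mount point /data are hypothetical, and the same options are more commonly set in /etc/fstab:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
    /* MS_RELATIME updates atime only when it is older than mtime/ctime;
     * the ext3 journaling mode is passed as an option string (requires
     * root and an existing, formatted device). */
    if (mount("/dev/sdb1", "/data", "ext3",
              MS_RELATIME, "data=writeback") < 0) {
        perror("mount");
        return 1;
    }
    return 0;
}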
Direct I/O versus Page cache
Direct I/O
– Preferable if the application caches data itself
Application knows best which data is needed again
Application knows which data is most likely needed next
Example: database management systems (DBMS)
– Preferable if caching makes no sense
Data only needed once
Backup and restore
Page cache
– Optimizes re-reads and writes, but can be critical
Data written to the page cache but not yet to disk is lost on a crash, which matters if the
data loss cannot easily be handled
– If the application cannot handle direct I/O
A typical example is a file server
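Even without direct I/O, an application that reads data only once (the backup/restore case above) can at least tell the kernel not to keep the pages. A sketch using posix_fadvise(); the file name is illustrative:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    char buf[1 << 16];
    ssize_t n;
    int fd = open("/backup/archive", O_RDONLY);

    if (fd < 0) {
        perror("open");
        return 1;
    }
    /* declare sequential access so read-ahead can be aggressive */
    posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);

    while ((n = read(fd, buf, sizeof(buf))) > 0)
        ;                        /* process the data exactly once */

    /* drop the now-useless pages from the page cache */
    posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
    close(fd);
    return 0;
}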
/*
* Each physical page in the system has a struct page associated with
* it to keep track of whatever it is we are using the page for at the
* moment. Note that we have no way to track which tasks are using
* a page, though if it is a pagecache page, rmap structures can tell us
* who is mapping it.
*
* The objects in struct page are organized in double word blocks in
* order to allow us to use atomic double word operations on portions
* of struct page. That is currently only used by slub but the arrangement
* allows the use of atomic double word operations on the flags/mapping
* and lru list pointers also.
*/
struct page {
/* First double word block */
unsigned long flags; /* Atomic flags, some possibly
* updated asynchronously */
struct address_space *mapping; /* If low bit clear, points to
* inode address_space, or NULL.
* If page mapped as anonymous
* memory, low bit is set, and
* it points to anon_vma object:
* see PAGE_MAPPING_ANON below.
*/
/* Second double word */
struct {
union {
pgoff_t index; /* Our offset within mapping. */
void *freelist; /* slub/slob first free object */
bool pfmemalloc; /* If set by the page allocator,
* ALLOC_NO_WATERMARKS was set
* and the low watermark was not
* met implying that the system
* is under some pressure. The
* caller should try to ensure
* this page is only used to
* free other pages.
*/
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
pgtable_t pmd_huge_pte; /* protected by page->ptl */
#endif
};
union {
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
/* Used for cmpxchg_double in slub */
unsigned long counters;
#else
/*
* Keep _count separate from slub cmpxchg_double data.
* As the rest of the double word is protected by
* slab_lock but _count is not.
*/
unsigned counters;
#endif
struct {
union {
/*
* Count of ptes mapped in
* mms, to show when page is
* mapped & limit reverse map
* searches.
*
* Used also for tail pages
* refcounting instead of
* _count. Tail pages cannot
* be mapped and keeping the
* tail page _count zero at
* all times guarantees
* get_page_unless_zero() will
* never succeed on tail
* pages.
*/
atomic_t _mapcount;
struct { /* SLUB */
unsigned inuse:16;
unsigned objects:15;
unsigned frozen:1;
};
int units; /* SLOB */
};
atomic_t _count; /* Usage count, see below. */
};
};
};
/* Third double word block */
union {
struct list_head lru; /* Pageout list, eg. active_list
* protected by zone->lru_lock !
*/
struct { /* slub per cpu partial pages */
struct page *next; /* Next partial slab */
#ifdef CONFIG_64BIT
int pages; /* Nr of partial slabs left */
int pobjects; /* Approximate # of objects */
#else
short int pages;
short int pobjects;
#endif
};
struct list_head list; /* slobs list of pages */
struct slab *slab_page; /* slab fields */
};
/* Remainder is not double word aligned */
union {
unsigned long private; /* Mapping-private opaque data:
* usually used for buffer_heads
* if PagePrivate set; used for
* swp_entry_t if PageSwapCache;
* indicates order in the buddy
* system if PG_buddy is set.
*/
#if USE_SPLIT_PTE_PTLOCKS
#if BLOATED_SPINLOCKS
spinlock_t *ptl;
#else
spinlock_t ptl;
#endif
#endif
struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */
struct page *first_page; /* Compound tail pages */
};
/*
* On machines where all RAM is mapped into kernel address space,
* we can simply calculate the virtual address. On machines with
* highmem some memory is mapped into kernel virtual memory
* dynamically, so we need a place to store that address.
* Note that this field could be 16 bits on x86 ... ;)
*
* Architectures with slow multiplication can define
* WANT_PAGE_VIRTUAL in asm/page.h
*/
#if defined(WANT_PAGE_VIRTUAL)
void *virtual; /* Kernel virtual address (NULL if
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
unsigned long debug_flags; /* Use atomic bitops on this */
#endif
#ifdef CONFIG_KMEMCHECK
/*
* kmemcheck wants to track the status of each byte in a page; this
* is a pointer to such a status block. NULL if not tracked.
*/
void *shadow;
#endif
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
int _last_cpupid;
#endif
};
struct address_space {
struct inode *host; /* owner: inode, block_device */
struct radix_tree_root page_tree; /* radix tree of all pages */
spinlock_t tree_lock; /* and lock protecting it */
unsigned int i_mmap_writable; /* count VM_SHARED mappings */
struct rb_root i_mmap; /* tree of private and shared mappings */
struct list_head i_mmap_nonlinear; /* list VM_NONLINEAR mappings */
struct mutex i_mmap_mutex; /* protect tree, count, list */
/* Protected by tree_lock together with the radix tree */
unsigned long nrpages; /* number of total pages */
unsigned long nrshadows; /* number of shadow entries */
pgoff_t writeback_index; /* writeback starts here */
const struct address_space_operations *a_ops; /* methods */
unsigned long flags; /* error bits/gfp mask */
struct backing_dev_info *backing_dev_info; /* device readahead, etc */
spinlock_t private_lock; /* for use by the address_space */
struct list_head private_list; /* ditto */
void *private_data; /* ditto */
} __attribute__((aligned(sizeof(long))));