準備
本文所涉及的源碼全部基于 Linux 內核 5.15。
ftrace
由于系統調用的路徑比較復雜,當我們首次閱讀內核代碼、不知道從哪里尋找系統調用函數入口的時候,可以通過ftrace跟蹤系統函數的調用棧來獲取函數的調用鏈。
使用ftrace來跟蹤read系統調用。以下腳本默認機器已經掛載了debugfs。具體的ftrace使用可參考 https://01.org/linuxgraphics/gfx-docs/drm/trace/ftrace.html
cd /sys/kernel/debug/tracing # enter the tracing directory for convenience; listing it shows whether this system supports ftrace
echo 0 > tracing_on # turn tracing off first to reduce noise from other traces
echo function_graph > current_tracer # select the function_graph tracer
echo __x64_sys_read > set_graph_function # trace the read syscall; on x64 the actual function behind sys_read is __x64_sys_read
echo 1 > tracing_on # turn tracing back on
head -30 trace # show the first 30 lines of the collected trace
比如下面為開啟ftrace時sys_read在xfs文件系統下的內核調用棧。從調用棧中可以得知在new_sync_read中會調用xfs_file_read_iter,也即具體的文件系統的實現。后續的源碼閱讀會基于ext4文件系統做說明,下面的調用棧只是使用測試機器的一個例子。
__x64_sys_read() {
ksys_read() {
__fdget_pos() {
__fget_light();
}
vfs_read() {
rw_verify_area() {
security_file_permission() {
apparmor_file_permission() {
common_file_perm() {
aa_file_perm() {
rcu_read_unlock_strict();
}
}
}
__fsnotify_parent();
}
}
new_sync_read() {
xfs_file_read_iter [xfs]() {
xfs_file_buffered_aio_read [xfs]() {
xfs_ilock [xfs]() {
down_read() {
_cond_resched() {
rcu_all_qs();
}
}
}
generic_file_read_iter() {
generic_file_buffered_read() {
_cond_resched() {
rcu_all_qs();
}
pagecache_get_page() {
find_get_entry() {
rcu_read_unlock_strict();
}
PageHuge();
}
mark_page_accessed();
_cond_resched() {
rcu_all_qs();
}
touch_atime() {
atime_needs_update();
}
}
}
xfs_iunlock [xfs]() {
up_read();
}
}
}
}
__fsnotify_parent();
}
系統調用sys_read
當用戶調用系統調用read從文件讀取數據時,會陷入內核(32位x86上通過int 0x80軟中斷,x86-64上通過syscall指令),內核的系統調用入口根據系統調用號找到內核read的入口函數 sys_read。sys_read 的函數定義如下,SYSCALL_DEFINEx 是內核的系統調用宏定義,x表示參數的個數,例如sys_read的宏定義為 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count),表示sys_read有三個參數。
/*
 * read(2) system call entry point.
 *
 * SYSCALL_DEFINE3 takes the syscall name followed by three (type, name)
 * pairs: fd, buf and count.  The body only forwards to ksys_read().
 */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	return ksys_read(fd, buf, count);
}
sys_read實際調用的是ksys_read。
ksys_read
ksys_read首先會根據fd拿到struct fd信息,判斷fd是否存在,fd如果錯誤直接返回EBADF錯誤碼。ksys_read的核心流程為vfs_read:通過虛擬文件系統的vfs_read來完成文件的讀取,對外屏蔽了不同文件系統的具體實現。
ssize_t ksys_read(unsigned intfd, char__user *buf,size_t count)
{
struct fd f =fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
loff_tpos, *ppos =file_ppos(f.file); // 獲取當(dāng)前文件的offset
if (ppos) {
pos = *ppos;
ppos = &pos;
}
ret =vfs_read(f.file, buf, count,ppos); // 調(diào)用虛擬文件系統(tǒng)vfs_read進行讀取
if (ret >= 0 && ppos)
f.file->f_pos =pos;
fdput_pos(f);
}
return ret;
}
虛擬文件系統
vfs_read 為虛擬文件系統的讀操作實現,函數內部會根據具體的文件系統實現調用對應的讀操作。
ssize_t vfs_read(struct file *file, char__user *buf,size_t count,loff_t *pos)
{
ssize_t ret;
if (!(file->f_mode &FMODE_READ))
return -EBADF;
if (!(file->f_mode &FMODE_CAN_READ))
return -EINVAL;
if (unlikely(!access_ok(buf, count)))
return -EFAULT;
ret = rw_verify_area(READ,file,pos, count);
if (ret)
return ret;
if (count >MAX_RW_COUNT)
count =MAX_RW_COUNT;
if (file->f_op->read)// 判斷文件系統(tǒng)是否實現(xiàn)了read接口
ret =file->f_op->read(file, buf, count,pos);
else if (file->f_op->read_iter) // 判斷文件系統(tǒng)是否實現(xiàn)了read_iter接口
ret = new_sync_read(file, buf, count,pos);
else
ret = -EINVAL;
if (ret > 0) {
fsnotify_access(file);
add_rchar(current, ret);
}
inc_syscr(current);
return ret;
}
vfs_read涉及到一個核心的數據結構 struct [file](https://elixir.bootlin.com/linux/v5.15/C/ident/file)。struct file 為fd對應的文件句柄的實現,file包含了文件權限、inode等信息。此處需要重點介紹的是file_operations:file_operations定義了一系列文件操作實現的函數指針,不同文件系統通過實現該系列函數指針來實現具體文件系統的io操作。
/*
 * struct file: the kernel object behind an open file descriptor.
 * The read path shown in this article uses f_mode, f_op and f_pos
 * (ksys_read/vfs_read) and f_mapping and f_ra (filemap_read).
 */
struct file {
union {
struct llist_node fu_llist;
struct rcu_head fu_rcuhead;
} f_u;
struct path f_path;
struct inode *f_inode; /* cached value */
const struct file_operations *f_op; // table of I/O operation function pointers for this file
/*
* Protects f_ep, f_flags.
* Must not be taken from IRQ context.
*/
spinlock_t f_lock;
enum rw_hint f_write_hint;
atomic_long_t f_count;
unsigned int f_flags;
fmode_t f_mode;
struct mutex f_pos_lock;
loff_t f_pos;
struct fown_struct f_owner;
const struct cred *f_cred;
struct file_ra_state f_ra;
u64 f_version;
#ifdef CONFIG_SECURITY
void *f_security;
#endif
/* needed for tty driver, and maybe others */
void *private_data;
#ifdef CONFIG_EPOLL
/* Used by fs/eventpoll.c to link all the hooks to this file */
struct hlist_head *f_ep;
#endif /* #ifdef CONFIG_EPOLL */
struct address_space *f_mapping; // the file's address_space; presumably holds its page cache pages (read as filp->f_mapping in filemap_read) - confirm
errseq_t f_wb_err;
errseq_t f_sb_err; /* for syncfs */
} __randomize_layout
__attribute__((aligned(4)));
file_operations
file_operations 定義了一系列文件操作的函數指針,包括seek、read、write、open、flush等等。
/*
 * struct file_operations: table of function pointers through which file
 * operations are dispatched.  A struct file points at one via f_op.
 * vfs_read() checks ->read first and ->read_iter second.
 */
struct file_operations {
struct module *owner;
loff_t (*llseek) (struct file *, loff_t, int);
/* read/write taking a user buffer, a length and a position pointer */
ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
/* read/write taking a kiocb and an iov_iter */
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
int (*iopoll)(struct kiocb *kiocb, bool spin);
int (*iterate) (struct file *, struct dir_context *);
int (*iterate_shared) (struct file *, struct dir_context *);
__poll_t (*poll) (struct file *, struct poll_table_struct *);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
unsigned long mmap_supported_flags;
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *, fl_owner_t id);
int (*release) (struct inode *, struct file *);
int (*fsync) (struct file *, loff_t, loff_t, int datasync);
int (*fasync) (int, struct file *, int);
int (*lock) (struct file *, int, struct file_lock *);
ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
int (*check_flags)(int);
int (*flock) (struct file *, int, struct file_lock *);
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
int (*setlease)(struct file *, long, struct file_lock **, void **);
long (*fallocate)(struct file *file, int mode, loff_t offset,
loff_t len);
void (*show_fdinfo)(struct seq_file *m, struct file *f);
#ifndef CONFIG_MMU
unsigned (*mmap_capabilities)(struct file *);
#endif
ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
loff_t, size_t, unsigned int);
loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags);
int (*fadvise)(struct file *, loff_t, loff_t, int);
} __randomize_layout;
ext4文件系統
vfs_read的實現里通過 file->f_op 的判斷來獲取對應的文件io實現方法。以ext4文件系統為例,ext4文件系統實現了read_iter方法,因此ext4 系統下sys_read實際會調用 new_sync_read。
/* Excerpt from vfs_read(): choose which filesystem read method to call. */
if (file->f_op->read)// does the filesystem provide a ->read method?
ret =file->f_op->read(file, buf, count,pos);
else if (file->f_op->read_iter) // otherwise, does it provide ->read_iter?
ret = new_sync_read(file, buf, count,pos);
else
ret = -EINVAL;
ext4_file_read_iter
ext4_file_operations 為ext4 文件操作接口的具體實現,可以看到ext4只實現了read_iter方法,沒有實現read方法。
/*
 * File operations ext4 installs for regular files.
 *
 * Only .read_iter is set (there is no .read), so vfs_read() reaches ext4
 * through new_sync_read() -> call_read_iter() -> ext4_file_read_iter().
 */
const struct file_operations ext4_file_operations = {
	.llseek		= ext4_llseek,
	.read_iter	= ext4_file_read_iter,
	.write_iter	= ext4_file_write_iter,
	.iopoll		= iomap_dio_iopoll,
	.unlocked_ioctl	= ext4_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= ext4_compat_ioctl,
#endif
	.mmap		= ext4_file_mmap,
	.mmap_supported_flags = MAP_SYNC,
	.open		= ext4_file_open,
	.release	= ext4_release_file,
	.fsync		= ext4_sync_file,
	.get_unmapped_area = thp_get_unmapped_area,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= ext4_fallocate,
};
由于ext4實現了read_iter接口,因此vfs_read的實際調用為 new_sync_read。
statics size_t new_sync_read(structfile *filp, char__user *buf,size_t len,loff_t *ppos)
{
struct iovec iov = { .iov_base = buf, .iov_len = len };
struct kiocb kiocb;
struct iov_iter iter;
ssize_t ret;
init_sync_kiocb(&kiocb,filp);
kiocb.ki_pos = (ppos ? *ppos : 0);
iov_iter_init(&iter,READ, &iov, 1, len);
ret =call_read_iter(filp, &kiocb, &iter);// 調(diào)用具體的read_iter實現(xiàn)
BUG_ON(ret == -EIOCBQUEUED);
if (ppos)
*ppos =kiocb.ki_pos;
return ret;
}
new_sync_read首先會初始化iovec、kiocb和iov_iter結構,然后調用 call_read_iter進行文件的io讀取。
call_read_iter 定義在include/linux/fs.h,此處則通過f_op->read_iter()調用具體的文件系統的實現。
static inline ssize_t call_read_iter(structfile *file, structkiocb *kio,
structiov_iter *iter)
{
returnfile->f_op->read_iter(kio,iter);// 此處調(diào)用read_iter具體的文件系統(tǒng)實現(xiàn)肉盹,根據(jù)上面講的昔驱,調(diào)用的具體實現(xiàn)即 ext4_file_read_iter
}
對應上文說到的,ext4文件系統的實現為 ext4_file_read_iter。
statics size_t ext4_file_read_iter(structkiocb *iocb, structiov_iter *to)
{
structinode *inode =file_inode(iocb->ki_filp);
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
if (!iov_iter_count(to))
return 0;/* skip atime */
#ifdef CONFIG_FS_DAX
if (IS_DAX(inode))
return ext4_dax_read_iter(iocb,to);
#endif
if (iocb->ki_flags &IOCB_DIRECT)
return ext4_dio_read_iter(iocb,to);
return generic_file_read_iter(iocb,to); // 不考慮dio的情況上忍,此處調(diào)用了系統(tǒng)默認(rèn)的讀取實現(xiàn)翼闹。
}
ext4_file_read_iter會通過IS_DAX(inode)判斷文件是否啟用了DAX(需要內核開啟CONFIG_FS_DAX),DAX的含義此處不做深入介紹,本文只介紹常用情況下ext4的讀操作流程。正常的讀操作流程其實是調用了內核的通用實現 generic_file_read_iter。
/*
 * generic_file_read_iter - generic read_iter implementation.
 * @iocb: request descriptor (file, position ki_pos, flags ki_flags)
 * @iter: destination iterator
 *
 * Returns 0 for an empty request.  With IOCB_DIRECT set it first deals
 * with the range via filemap_range_needs_writeback() (-EAGAIN under
 * IOCB_NOWAIT) or filemap_write_and_wait_range(), then calls the
 * mapping's ->direct_IO method.  Everything else, and the remainder of a
 * short direct read, is handled by filemap_read().
 */
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
size_t count = iov_iter_count(iter);
ssize_t retval = 0;
if (!count)
return 0; /* skip atime */
if (iocb->ki_flags & IOCB_DIRECT) { // direct I/O requested?
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping; // the file's address_space, used below for writeback and ->direct_IO
struct inode *inode = mapping->host;
loff_t size;
size = i_size_read(inode);
if (iocb->ki_flags & IOCB_NOWAIT) {
if (filemap_range_needs_writeback(mapping, iocb->ki_pos,
iocb->ki_pos + count - 1))
return -EAGAIN;
} else {
retval = filemap_write_and_wait_range(mapping,
iocb->ki_pos,
iocb->ki_pos + count - 1);
if (retval < 0)
return retval;
}
file_accessed(file);
retval = mapping->a_ops->direct_IO(iocb, iter);
if (retval >= 0) {
iocb->ki_pos += retval;
count -= retval;
}
if (retval != -EIOCBQUEUED)
iov_iter_revert(iter, count - iov_iter_count(iter));
/*
* Btrfs can have a short DIO read if we encounter
* compressed extents, so if there was an error, or if
* we've already read everything we wanted to, or if
* there was a short read because we hit EOF, go ahead
* and return. Otherwise fallthrough to buffered io for
* the rest of the read. Buffered reads will not work for
* DAX files, so don't bother trying.
*/
if (retval < 0 || !count || iocb->ki_pos >= size ||
IS_DAX(inode))
return retval;
}
return filemap_read(iocb, iter, retval); // the non-direct (buffered) case ends up here
}
該函數會判斷是否設置了dio(IOCB_DIRECT),dio此處不做深入解析,直接看filemap_read的具體實現。
page cache
/*
 * filemap_read - copy file data from page cache pages into @iter.
 * @iocb:         request descriptor; ki_pos is advanced by the bytes copied
 * @iter:         destination iterator
 * @already_read: bytes the caller has already read; included in the result
 *
 * Repeatedly obtains a batch of pages with filemap_get_pages() and copies
 * from them with copy_page_to_iter() until the iterator is exhausted,
 * the position reaches i_size, or an error occurs.
 *
 * Returns the total number of bytes read (including @already_read) when
 * that is non-zero, otherwise the error code (0 if there was none).
 */
ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
ssize_t already_read)
{
struct file *filp = iocb->ki_filp;
struct file_ra_state *ra = &filp->f_ra;
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
struct pagevec pvec;
int i, error = 0;
bool writably_mapped;
loff_t isize, end_offset;
if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
return 0;
if (unlikely(!iov_iter_count(iter)))
return 0;
iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
pagevec_init(&pvec);
do {
cond_resched();
/*
* If we've already successfully copied some data, then we
* can no longer safely return -EIOCBQUEUED. Hence mark
* an async read NOWAIT at that point.
*/
if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
iocb->ki_flags |= IOCB_NOWAIT;
error = filemap_get_pages(iocb, iter, &pvec); // fill pvec with the next batch of pages to copy from
if (error < 0)
break;
/*
* i_size must be checked after we know the pages are Uptodate.
*
* Checking i_size after the check allows us to calculate
* the correct value for "nr", which means the zero-filled
* part of the page is not copied back to userspace (unless
* another truncate extends the file - this is desired though).
*/
isize = i_size_read(inode);
if (unlikely(iocb->ki_pos >= isize))
goto put_pages;
end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
/*
* Once we start copying data, we don't want to be touching any
* cachelines that might be contended:
*/
writably_mapped = mapping_writably_mapped(mapping);
/*
* When a sequential read accesses a page several times, only
* mark it as accessed the first time.
*/
if (iocb->ki_pos >> PAGE_SHIFT !=
ra->prev_pos >> PAGE_SHIFT)
mark_page_accessed(pvec.pages[0]);
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
size_t page_size = thp_size(page);
size_t offset = iocb->ki_pos & (page_size - 1);
size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
page_size - offset);
size_t copied;
if (end_offset < page_offset(page))
break;
if (i > 0)
mark_page_accessed(page);
/*
* If users can be writing to this page using arbitrary
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
if (writably_mapped) {
int j;
for (j = 0; j < thp_nr_pages(page); j++)
flush_dcache_page(page + j);
}
copied = copy_page_to_iter(page, offset, bytes, iter);
already_read += copied;
iocb->ki_pos += copied;
ra->prev_pos = iocb->ki_pos;
if (copied < bytes) {
error = -EFAULT;
break;
}
}
put_pages:
for (i = 0; i < pagevec_count(&pvec); i++)
put_page(pvec.pages[i]); // release each page of the batch; presumably drops the reference obtained via filemap_get_pages() - confirm
pagevec_reinit(&pvec);
} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
file_accessed(filp);
return already_read ? already_read : error;
}
當我們從文件讀取數據時,在非dio的場景下,往往是先判斷文件對應的page是否存在page cache中,如果存在并且該page的內容是最新的(uptodate),那么就可以直接從page cache讀取。通過page cache可以大大提升文件的讀寫性能,page cache的讀取具體實現細節則在 filemap_get_pages。
/*
 * filemap_get_pages - fill @pvec with pages covering the start of the read.
 * @iocb: request descriptor; ki_pos selects the first page index
 * @iter: destination iterator; its count bounds the last page index
 * @pvec: page vector to fill
 *
 * Looks the pages up with filemap_get_read_batch().  If none are found it
 * tries page_cache_sync_readahead() and looks again, and as a last resort
 * calls filemap_create_page().  The last page of the batch may trigger
 * readahead or an update if it is not uptodate.
 *
 * Returns 0 on success, -EINTR if a fatal signal is pending, -EAGAIN when
 * the request flags forbid the needed I/O or waiting, or another error
 * from the helpers.
 */
static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
struct pagevec *pvec)
{
struct file *filp = iocb->ki_filp;
struct address_space *mapping = filp->f_mapping;
struct file_ra_state *ra = &filp->f_ra;
pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
pgoff_t last_index;
struct page *page;
int err = 0;
last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE);
retry:
if (fatal_signal_pending(current))
return -EINTR;
filemap_get_read_batch(mapping, index, last_index, pvec); // fetch a batch of pages into pvec
if (!pagevec_count(pvec)) {
if (iocb->ki_flags & IOCB_NOIO)
return -EAGAIN;
page_cache_sync_readahead(mapping, ra, filp, index,
last_index - index);
filemap_get_read_batch(mapping, index, last_index, pvec);
}
if (!pagevec_count(pvec)) {
if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
return -EAGAIN;
err = filemap_create_page(filp, mapping,
iocb->ki_pos >> PAGE_SHIFT, pvec); // still nothing in the cache: create a page and read it in
if (err == AOP_TRUNCATED_PAGE)
goto retry;
return err;
}
page = pvec->pages[pagevec_count(pvec) - 1];
if (PageReadahead(page)) {
err = filemap_readahead(iocb, filp, mapping, page, last_index); // last page of the batch carries the readahead flag
// for sequential I/O, reading ahead can speed up the next read and reduce waiting on disk I/O
if (err)
goto err;
}
if (!PageUptodate(page)) {
if ((iocb->ki_flags & IOCB_WAITQ) && pagevec_count(pvec) > 1)
iocb->ki_flags |= IOCB_NOWAIT;
err = filemap_update_page(iocb, mapping, iter, page);
if (err)
goto err;
}
return 0;
err:
if (err < 0)
put_page(page);
if (likely(--pvec->nr))
return 0;
if (err == AOP_TRUNCATED_PAGE)
goto retry;
return err;
}
當文件的page不存在page cache中(即page cache未命中,注意這并非缺頁異常),則會進入 filemap_create_page,分配新的page并從磁盤讀入數據。
/*
 * filemap_create_page - allocate a page, insert it at @index and read it in.
 * @file:    file being read
 * @mapping: address_space the page is added to
 * @index:   page index within @mapping
 * @pvec:    page vector that receives the page on success
 *
 * Returns 0 on success, -ENOMEM if no page could be allocated,
 * AOP_TRUNCATED_PAGE if a page already exists at @index (-EEXIST from
 * add_to_page_cache_lru()), or the error from the insert or the read.
 * On failure the page is released with put_page().
 */
static int filemap_create_page(struct file *file,
struct address_space *mapping, pgoff_t index,
struct pagevec *pvec)
{
struct page *page;
int error;
page = page_cache_alloc(mapping); // allocate a new page for the page cache
if (!page)
return -ENOMEM;
/*
* Protect against truncate / hole punch. Grabbing invalidate_lock here
* assures we cannot instantiate and bring uptodate new pagecache pages
* after evicting page cache during truncate and before actually
* freeing blocks. Note that we could release invalidate_lock after
* inserting the page into page cache as the locked page would then be
* enough to synchronize with hole punching. But there are code paths
* such as filemap_update_page() filling in partially uptodate pages or
* ->readpages() that need to hold invalidate_lock while mapping blocks
* for IO so let's hold the lock here as well to keep locking rules
* simple.
*/
filemap_invalidate_lock_shared(mapping);
error = add_to_page_cache_lru(page, mapping, index,
mapping_gfp_constraint(mapping, GFP_KERNEL));// insert into the page cache and its LRU
if (error == -EEXIST)
error = AOP_TRUNCATED_PAGE;
if (error)
goto error;
error = filemap_read_page(file, mapping, page); // fill the page with file data
if (error)
goto error;
filemap_invalidate_unlock_shared(mapping);
pagevec_add(pvec, page);
return 0;
error:
filemap_invalidate_unlock_shared(mapping);
put_page(page);
return error;
}
filemap_create_page首先會分配內存page,分配內存page的實現為 page_cache_alloc,此處通過內核的內存分配伙伴系統分配一個page,伙伴系統的詳細實現本文先不做深入探討。
/*
 * __page_cache_alloc(): with CONFIG_NUMA it is only declared here (defined
 * elsewhere); without it, it is alloc_pages() with order 0.
 */
#ifdef CONFIG_NUMA
extern struct page *__page_cache_alloc(gfp_t gfp);
#else
static inline struct page *__page_cache_alloc(gfp_t gfp)
{
return alloc_pages(gfp, 0);
}
#endif
/* Allocate a page for mapping @x using the gfp mask from mapping_gfp_mask(). */
static inline struct page *page_cache_alloc(struct address_space *x)
{
return __page_cache_alloc(mapping_gfp_mask(x));
}
[alloc_pages](https://elixir.bootlin.com/linux/v5.15/source/include/linux/gfp.h#L588)
/*
 * alloc_pages(): allocate pages of the given order with the given gfp mask.
 */
#ifdef CONFIG_NUMA
/* NUMA builds: only declared here; the definition lives elsewhere. */
struct page *alloc_pages(gfp_t gfp, unsigned int order);
extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
			struct vm_area_struct *vma, unsigned long addr,
			int node, bool hugepage);
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
	alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
#else
/* Non-NUMA builds: allocate from the node returned by numa_node_id(). */
static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
{
	return alloc_pages_node(numa_node_id(), gfp_mask, order);
}
/*
 * NOTE(review): upstream gfp.h appears to define further !NUMA macros
 * before this #endif; they are not part of this excerpt - confirm.
 */
#endif
分配完page后,則通過 filemap_read_page 從磁盤讀取文件page。
/*
 * filemap_read_page - read one page through the mapping's ->readpage method.
 * @file:    file being read
 * @mapping: address_space whose a_ops->readpage is called
 * @page:    page to fill
 *
 * Starts the read, waits (killably) for the page to be unlocked, and
 * returns 0 if the page is then uptodate.  Returns the error from
 * ->readpage or from the wait, or -EIO (after calling
 * shrink_readahead_size_eio()) if the page is still not uptodate.
 */
static int filemap_read_page(struct file *file, struct address_space *mapping,
struct page *page)
{
int error;
/*
* A previous I/O error may have been due to temporary failures,
* eg. multipath errors. PG_error will be set again if readpage
* fails.
*/
ClearPageError(page);
/* Start the actual read. The read will unlock the page. */
error = mapping->a_ops->readpage(file, page);// readpage is a function pointer; ext4_aops sets it to
// ext4_readpage
if (error)
return error;
error = wait_on_page_locked_killable(page);
if (error)
return error;
if (PageUptodate(page))
return 0;
shrink_readahead_size_eio(&file->f_ra);
return -EIO;
}
從物理文件讀取page的核心定義為 a_ops(struct address_space_operations),該結構體定義了從物理文件讀取page的一系列函數,不同的文件系統對應到具體不同的函數實現。
/*
 * struct address_space_operations: table of function pointers reached
 * through mapping->a_ops.  The read path in this article calls ->readpage
 * (filemap_read_page) and ->direct_IO (generic_file_read_iter).
 */
struct address_space_operations {
int (*writepage)(struct page *page, struct writeback_control *wbc);
int (*readpage)(struct file *, struct page *);
/* Write back some dirty pages from this mapping. */
int (*writepages)(struct address_space *, struct writeback_control *);
/* Set a page dirty. Return true if this dirtied it */
int (*set_page_dirty)(struct page *page);
/*
* Reads in the requested pages. Unlike ->readpage(), this is
* PURELY used for read-ahead!.
*/
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
void (*readahead)(struct readahead_control *);
int (*write_begin)(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata);
int (*write_end)(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
sector_t (*bmap)(struct address_space *, sector_t);
void (*invalidatepage) (struct page *, unsigned int, unsigned int);
int (*releasepage) (struct page *, gfp_t);
void (*freepage)(struct page *);
ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
/*
* migrate the contents of a page to the specified target. If
* migrate_mode is MIGRATE_ASYNC, it must not block.
*/
int (*migratepage) (struct address_space *,
struct page *, struct page *, enum migrate_mode);
bool (*isolate_page)(struct page *, isolate_mode_t);
void (*putback_page)(struct page *);
int (*launder_page) (struct page *);
int (*is_partially_uptodate) (struct page *, unsigned long,
unsigned long);
void (*is_dirty_writeback) (struct page *, bool *, bool *);
int (*error_remove_page)(struct address_space *, struct page *);
/* swapfile support */
int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
sector_t *span);
void (*swap_deactivate)(struct file *file);
};
同樣以ext4為例,ext4 的a_ops具體定義為ext4_aops,相對應的readpage實現為ext4_readpage。
/*
 * An address_space_operations table defined by ext4.  Its .readpage entry
 * is ext4_readpage, which is what mapping->a_ops->readpage() resolves to
 * in filemap_read_page() for mappings that use this table.
 */
static const struct address_space_operations ext4_aops = {
.readpage = ext4_readpage,
.readahead = ext4_readahead,
.writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_write_end,
.set_page_dirty = ext4_set_page_dirty,
.bmap = ext4_bmap,
.invalidatepage = ext4_invalidatepage,
.releasepage = ext4_releasepage,
.direct_IO = noop_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
.swap_activate = ext4_iomap_swap_activate,
};
/*
 * ext4_readpage - ext4's ->readpage method.
 * @file: file being read (not used in this function body)
 * @page: page to fill; its mapping's host inode is the file's inode
 *
 * If the inode has inline data, tries ext4_readpage_inline() first.  When
 * there is no inline data, or the inline read returns -EAGAIN, falls back
 * to ext4_mpage_readpages().  Returns the result of whichever path ran.
 */
static int ext4_readpage(struct file *file, struct page *page)
{
int ret = -EAGAIN; /* -EAGAIN selects the ext4_mpage_readpages() path below */
struct inode *inode = page->mapping->host;
trace_ext4_readpage(page);
if (ext4_has_inline_data(inode))
ret = ext4_readpage_inline(inode, page);
if (ret == -EAGAIN)
return ext4_mpage_readpages(inode, NULL, page);
return ret;
}
ext4_mpage_readpages則構造bio請求從塊設備讀取數據:通過構造bio將任務提交到io調度器,從而向塊設備驅動提交讀請求。至此,用戶發起的讀系統調用請求真正進入磁盤塊設備讀取物理文件數據。
reference:
- SYSCALL_DEFINE https://blog.csdn.net/hxmhyp/article/details/22699669