【linux內核源碼】 io操作之read

準備

本文所涉及的源碼全部基于linux內核5.15。

ftrace

由于系統調用的路徑比較復雜,當我們首次閱讀內核代碼不知道從哪里尋找系統調用函數入口的時候,可以通過ftrace跟蹤系統函數的調用棧來獲取函數的調用鏈。

使用ftrace來跟蹤read系統調用。以下腳本默認機器已經掛載了debugfs。具體的ftrace使用可參考 https://01.org/linuxgraphics/gfx-docs/drm/trace/ftrace.html

cd /sys/kernel/debug/tracing # enter the tracing directory for convenience; listing it with ls shows whether this system supports ftrace
echo 0 > tracing_on # turn tracing off first to reduce interference from other traces
echo function_graph > current_tracer # select the function_graph tracer
echo  __x64_sys_read  > set_graph_function # trace the read system call; on x64 the actual function behind sys_read is __x64_sys_read
echo 1 > tracing_on # turn tracing back on
head -30 trace # show the first 30 lines of the trace output

比如下圖為開啟ftrace時sys_read在xfs文件系統下的內核調用棧。從調用棧中可以得知在new_sync_read中會調用xfs_file_read_iter,也即具體的文件系統的實現。后續的源碼閱讀會基于ext4文件系統做說明,下面的調用棧只是使用測試機器的一個例子。

__x64_sys_read() {
  ksys_read() {
    __fdget_pos() {
    __fget_light();
    }       
        vfs_read() {
          rw_verify_area() {
            security_file_permission() {
              apparmor_file_permission() {
                common_file_perm() {
                  aa_file_perm() {
                    rcu_read_unlock_strict();
                  }
                }
              }
              __fsnotify_parent();
            }
          }
          new_sync_read() {
            xfs_file_read_iter [xfs]() {
              xfs_file_buffered_aio_read [xfs]() {
                xfs_ilock [xfs]() {
                  down_read() {
                    _cond_resched() {
                      rcu_all_qs();
                    }
                  }
                }
                generic_file_read_iter() {
                  generic_file_buffered_read() {
                    _cond_resched() {
                      rcu_all_qs();
                    }
                    pagecache_get_page() {
                      find_get_entry() {
                        rcu_read_unlock_strict();
                      }
                      PageHuge();
                    }
                    mark_page_accessed();
                    _cond_resched() {
                      rcu_all_qs();
                    }
                    touch_atime() {
                      atime_needs_update();
                    }
                  }
                }
                xfs_iunlock [xfs]() {
                  up_read();
                }
              }
            }
          }
          __fsnotify_parent();
        }

系統調用sys_read

當用戶調用系統調用read從文件讀取數據時,實際會陷入內核(32位x86上通過int 0x80軟中斷,x86-64上通過syscall指令),內核根據系統調用號找到read的入口函數 sys_read。sys_read 的函數定義如下,SYSCALL_DEFINEx 是內核的系統調用宏定義,x表示參數的個數,例如sys_read的宏定義為 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count),表示sys_read有三個參數。

sys_read

/*
 * read(2) entry point, declared through the SYSCALL_DEFINE3 macro with
 * three (type, name) argument pairs: fd, buf and count.
 *
 * It simply forwards to ksys_read() and returns its result.
 */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
    return ksys_read(fd, buf, count);
}

sys_read實際調用的是ksys_read。

ksys_read:ksys_read首先會根據fd拿到struct fd信息,判斷fd對應的文件是否存在,fd如果錯誤直接返回EBADF錯誤碼。ksys_read的核心流程為vfs_read:通過虛擬文件系統的vfs_read來完成文件的讀取,對外屏蔽了不同文件系統的具體實現。

ssize_t ksys_read(unsigned intfd, char__user *buf,size_t count)
{
    struct fd f =fdget_pos(fd);
    ssize_t ret = -EBADF;
    if (f.file) {
        loff_tpos, *ppos =file_ppos(f.file); // 獲取當(dāng)前文件的offset
        if (ppos) {
            pos = *ppos;
            ppos = &pos;
        }
        ret =vfs_read(f.file, buf, count,ppos); // 調(diào)用虛擬文件系統(tǒng)vfs_read進行讀取
        if (ret >= 0 && ppos)
            f.file->f_pos =pos;
            fdput_pos(f);
    }
    return ret;
}

虛擬文件系統

vfs_read 為虛擬文件系統的讀操作實現,函數內部會根據具體的文件系統實現調用對應的讀操作。

ssize_t vfs_read(struct file *file, char__user *buf,size_t count,loff_t *pos)
{
ssize_t ret;

    if (!(file->f_mode &FMODE_READ))
        return -EBADF;
    if (!(file->f_mode &FMODE_CAN_READ))
        return -EINVAL;
    if (unlikely(!access_ok(buf, count)))
        return -EFAULT;

    ret = rw_verify_area(READ,file,pos, count);
    if (ret)
        return ret;
    if (count >MAX_RW_COUNT)
        count =MAX_RW_COUNT;

    if (file->f_op->read)// 判斷文件系統(tǒng)是否實現(xiàn)了read接口
        ret =file->f_op->read(file, buf, count,pos);
    else if (file->f_op->read_iter) // 判斷文件系統(tǒng)是否實現(xiàn)了read_iter接口
        ret = new_sync_read(file, buf, count,pos);
    else
        ret = -EINVAL;
    if (ret > 0) {
fsnotify_access(file);
add_rchar(current, ret);
    }
inc_syscr(current);
    return ret;
}

vfs_read涉及到一個核心的數據結構 struct **[file](<https://elixir.bootlin.com/linux/v5.15/C/ident/file>)**,它是fd對應的文件句柄的實現。file包含了文件權限、inode等信息,此處需要重點介紹的是file_operations: file_operations定義了一系列的文件操作實現的函數指針,不同文件系統通過實現該系列函數指針來實現具體文件系統的io操作。

/*
 * struct file - the object ksys_read() obtains from a file descriptor
 * (f.file) and passes to vfs_read().
 *
 * Fields used by the read path in this article: f_mode (checked in
 * vfs_read), f_op (read / read_iter hooks), f_pos (current offset),
 * f_mapping and f_ra (used by filemap_read / filemap_get_pages).
 */
struct file {
    union {
        struct llist_node   fu_llist;
        struct rcu_head     fu_rcuhead;
    } f_u;
    struct path     f_path;
    struct inode        *f_inode;   /* cached value */
    const struct file_operations    *f_op; // table of function pointers implementing I/O for this file

    /*
     * Protects f_ep, f_flags.
     * Must not be taken from IRQ context.
     */
    spinlock_t      f_lock;
    enum rw_hint        f_write_hint;
    atomic_long_t       f_count;
    unsigned int        f_flags;
    fmode_t         f_mode;
    struct mutex        f_pos_lock;
    loff_t          f_pos;
    struct fown_struct  f_owner;
    const struct cred   *f_cred;
    struct file_ra_state    f_ra;

    u64         f_version;
#ifdef CONFIG_SECURITY
    void            *f_security;
#endif
    /* needed for tty driver, and maybe others */
    void            *private_data;

#ifdef CONFIG_EPOLL
    /* Used by fs/eventpoll.c to link all the hooks to this file */
    struct hlist_head   *f_ep;
#endif /* #ifdef CONFIG_EPOLL */
    struct address_space    *f_mapping; // address_space the buffered read path uses to look up this file's cached pages
    errseq_t        f_wb_err;
    errseq_t        f_sb_err; /* for syncfs */
} __randomize_layout
  __attribute__((aligned(4)));

file_operations:file_operations 定義了一系列文件操作的函數指針,包括seek、read、write、open、flush等等。

/*
 * struct file_operations - per-file table of operation hooks, reached
 * through file->f_op.
 *
 * vfs_read() calls ->read when it is set, otherwise falls back to
 * ->read_iter (through new_sync_read), and fails with -EINVAL when
 * neither is provided.
 */
struct file_operations {
    struct module *owner;
    loff_t (*llseek) (struct file *, loff_t, int);
    /* buffer-based read hook; tried first by vfs_read() */
    ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
    ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
    /* kiocb/iov_iter based read hook; invoked via call_read_iter() */
    ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
    ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
    int (*iopoll)(struct kiocb *kiocb, bool spin);
    int (*iterate) (struct file *, struct dir_context *);
    int (*iterate_shared) (struct file *, struct dir_context *);
    __poll_t (*poll) (struct file *, struct poll_table_struct *);
    long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
    long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
    int (*mmap) (struct file *, struct vm_area_struct *);
    unsigned long mmap_supported_flags;
    int (*open) (struct inode *, struct file *);
    int (*flush) (struct file *, fl_owner_t id);
    int (*release) (struct inode *, struct file *);
    int (*fsync) (struct file *, loff_t, loff_t, int datasync);
    int (*fasync) (int, struct file *, int);
    int (*lock) (struct file *, int, struct file_lock *);
    ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
    unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
    int (*check_flags)(int);
    int (*flock) (struct file *, int, struct file_lock *);
    ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
    ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
    int (*setlease)(struct file *, long, struct file_lock **, void **);
    long (*fallocate)(struct file *file, int mode, loff_t offset,
              loff_t len);
    void (*show_fdinfo)(struct seq_file *m, struct file *f);
#ifndef CONFIG_MMU
    unsigned (*mmap_capabilities)(struct file *);
#endif
    ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
            loff_t, size_t, unsigned int);
    loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
                   struct file *file_out, loff_t pos_out,
                   loff_t len, unsigned int remap_flags);
    int (*fadvise)(struct file *, loff_t, loff_t, int);
} __randomize_layout;

ext4文件系統

vfs_read的實現里通過 file→f_op的判斷來獲取對應的文件io實現方法。以ext4文件系統為例,ext4文件系統實現了read_iter方法,因此ext4 系統下sys_read實際會調用 new_sync_read。

if (file->f_op->read)// does the filesystem provide the read hook?
        ret =file->f_op->read(file, buf, count,pos);
    else if (file->f_op->read_iter) // otherwise, does it provide the read_iter hook?
        ret = new_sync_read(file, buf, count,pos);
    else
        ret = -EINVAL;

ext4_file_read_iter:ext4_file_operations 為ext4 文件操作接口的具體實現,可以看到ext4只實現了read_iter方法,沒有實現read方法。

/*
 * ext4's file_operations table.
 *
 * Only .read_iter is set (no .read), so vfs_read() reaches ext4 through
 * new_sync_read() -> call_read_iter() -> ext4_file_read_iter().
 */
const struct file_operations ext4_file_operations = {
    .llseek     = ext4_llseek,
    .read_iter  = ext4_file_read_iter,
    .write_iter = ext4_file_write_iter,
    .iopoll     = iomap_dio_iopoll,
    .unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl   = ext4_compat_ioctl,
#endif
    .mmap       = ext4_file_mmap,
    .mmap_supported_flags = MAP_SYNC,
    .open       = ext4_file_open,
    .release    = ext4_release_file,
    .fsync      = ext4_sync_file,
    .get_unmapped_area = thp_get_unmapped_area,
    .splice_read    = generic_file_splice_read,
    .splice_write   = iter_file_splice_write,
    .fallocate  = ext4_fallocate,
};

由于ext4實現了read_iter接口,因此vfs_read的實際調用為 new_sync_read

statics size_t new_sync_read(structfile *filp, char__user *buf,size_t len,loff_t *ppos)
{
    struct iovec iov = { .iov_base = buf, .iov_len = len };
    struct kiocb kiocb;
    struct iov_iter iter;
  ssize_t ret;

 init_sync_kiocb(&kiocb,filp);
 kiocb.ki_pos = (ppos ? *ppos : 0);
 iov_iter_init(&iter,READ, &iov, 1, len);

ret =call_read_iter(filp, &kiocb, &iter);// 調(diào)用具體的read_iter實現(xiàn)
BUG_ON(ret == -EIOCBQUEUED);
    if (ppos)
        *ppos =kiocb.ki_pos;
    return ret;
}

new_sync_read首先會初始化iovec結構,然后調用 call_read_iter進行文件的io讀取。

call_read_iter 定義在include/linux/fs.h,此處則通過f_op→read_iter(),調用具體的文件系統的實現。

static inline ssize_t call_read_iter(structfile *file, structkiocb *kio,
                     structiov_iter *iter)
{
    returnfile->f_op->read_iter(kio,iter);// 此處調(diào)用read_iter具體的文件系統(tǒng)實現(xiàn)肉盹,根據(jù)上面講的昔驱,調(diào)用的具體實現(xiàn)即 ext4_file_read_iter
}

對應上文說到的,ext4文件系統的實現為 ext4_file_read_iter

statics size_t ext4_file_read_iter(structkiocb *iocb, structiov_iter *to)
{
    structinode *inode =file_inode(iocb->ki_filp);

    if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
        return -EIO;

    if (!iov_iter_count(to))
        return 0;/* skip atime */
#ifdef CONFIG_FS_DAX
    if (IS_DAX(inode))
        return ext4_dax_read_iter(iocb,to);
#endif
    if (iocb->ki_flags &IOCB_DIRECT)
        return ext4_dio_read_iter(iocb,to);

    return generic_file_read_iter(iocb,to); // 不考慮dio的情況上忍,此處調(diào)用了系統(tǒng)默認(rèn)的讀取實現(xiàn)翼闹。
}

ext4_file_read_iter會判斷文件是否啟用了DAX(IS_DAX)以及本次讀取是否為dio,DAX的含義此處不做深入介紹,本文只介紹常用情況下ext4的讀操作流程。正常的讀操作流程其實是調用了內核通用的默認實現

generic_file_read_iter

/*
 * generic_file_read_iter - generic ->read_iter implementation.
 * @iocb: request descriptor (file, position, flags)
 * @iter: destination iterator
 *
 * Returns 0 for an empty request. With IOCB_DIRECT set it first writes
 * back / checks the affected page-cache range, then calls the mapping's
 * ->direct_IO hook; it returns that result directly on error, when the
 * request is complete, at or beyond i_size, or for DAX inodes. In every
 * other case it finishes (or performs) the read via filemap_read(),
 * passing along the number of bytes already read.
 */
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
    size_t count = iov_iter_count(iter);
    ssize_t retval = 0;

    if (!count)
        return 0; /* skip atime */

    if (iocb->ki_flags & IOCB_DIRECT) { // direct I/O requested?
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping; // the file's address_space
        struct inode *inode = mapping->host;
        loff_t size;

        size = i_size_read(inode);
        if (iocb->ki_flags & IOCB_NOWAIT) {
            if (filemap_range_needs_writeback(mapping, iocb->ki_pos,
                        iocb->ki_pos + count - 1))
                return -EAGAIN;
        } else {
            retval = filemap_write_and_wait_range(mapping,
                        iocb->ki_pos,
                            iocb->ki_pos + count - 1);
            if (retval < 0)
                return retval;
        }

        file_accessed(file);

        retval = mapping->a_ops->direct_IO(iocb, iter);
        if (retval >= 0) {
            iocb->ki_pos += retval;
            count -= retval;
        }
        if (retval != -EIOCBQUEUED)
            iov_iter_revert(iter, count - iov_iter_count(iter));

        /*
         * Btrfs can have a short DIO read if we encounter
         * compressed extents, so if there was an error, or if
         * we've already read everything we wanted to, or if
         * there was a short read because we hit EOF, go ahead
         * and return.  Otherwise fallthrough to buffered io for
         * the rest of the read.  Buffered reads will not work for
         * DAX files, so don't bother trying.
         */
        if (retval < 0 || !count || iocb->ki_pos >= size ||
            IS_DAX(inode))
            return retval;
    }

    return filemap_read(iocb, iter, retval); // buffered path: this is the call taken when direct I/O is not in play
}

該函數會判斷是否設置了dio,dio此處不做深入解析,直接看filemap_read的具體實現。

page cache

filemap_read

/*
 * filemap_read - copy file data from the page cache into @iter.
 * @iocb:         request descriptor; ki_pos is advanced by the bytes copied
 * @iter:         destination iterator
 * @already_read: bytes already read by the caller (added to the result)
 *
 * Loops: obtain a batch of pages with filemap_get_pages(), clamp the copy
 * range to i_size, copy page by page with copy_page_to_iter(), then
 * put_page() every page of the batch. The loop ends when @iter is full,
 * the position reaches i_size, or an error occurs.
 *
 * Returns the total number of bytes read if non-zero, otherwise the error
 * code (0 if there was none).
 */
ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
        ssize_t already_read)
{
    struct file *filp = iocb->ki_filp;
    struct file_ra_state *ra = &filp->f_ra;
    struct address_space *mapping = filp->f_mapping;
    struct inode *inode = mapping->host;
    struct pagevec pvec;
    int i, error = 0;
    bool writably_mapped;
    loff_t isize, end_offset;

    if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
        return 0;
    if (unlikely(!iov_iter_count(iter)))
        return 0;

    iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
    pagevec_init(&pvec);

    do {
        cond_resched();

        /*
         * If we've already successfully copied some data, then we
         * can no longer safely return -EIOCBQUEUED. Hence mark
         * an async read NOWAIT at that point.
         */
        if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
            iocb->ki_flags |= IOCB_NOWAIT;

        error = filemap_get_pages(iocb, iter, &pvec); // fill pvec with page-cache pages for the current position
        if (error < 0)
            break;

        /*
         * i_size must be checked after we know the pages are Uptodate.
         *
         * Checking i_size after the check allows us to calculate
         * the correct value for "nr", which means the zero-filled
         * part of the page is not copied back to userspace (unless
         * another truncate extends the file - this is desired though).
         */
        isize = i_size_read(inode);
        if (unlikely(iocb->ki_pos >= isize))
            goto put_pages;
        end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);

        /*
         * Once we start copying data, we don't want to be touching any
         * cachelines that might be contended:
         */
        writably_mapped = mapping_writably_mapped(mapping);

        /*
         * When a sequential read accesses a page several times, only
         * mark it as accessed the first time.
         */
        if (iocb->ki_pos >> PAGE_SHIFT !=
            ra->prev_pos >> PAGE_SHIFT)
            mark_page_accessed(pvec.pages[0]);

        for (i = 0; i < pagevec_count(&pvec); i++) {
            struct page *page = pvec.pages[i];
            size_t page_size = thp_size(page);
            size_t offset = iocb->ki_pos & (page_size - 1);
            size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
                         page_size - offset);
            size_t copied;

            if (end_offset < page_offset(page))
                break;
            if (i > 0)
                mark_page_accessed(page);
            /*
             * If users can be writing to this page using arbitrary
             * virtual addresses, take care about potential aliasing
             * before reading the page on the kernel side.
             */
            if (writably_mapped) {
                int j;

                for (j = 0; j < thp_nr_pages(page); j++)
                    flush_dcache_page(page + j);
            }

            copied = copy_page_to_iter(page, offset, bytes, iter);

            already_read += copied;
            iocb->ki_pos += copied;
            ra->prev_pos = iocb->ki_pos;

            /* a short copy is reported as -EFAULT and ends the loop */
            if (copied < bytes) {
                error = -EFAULT;
                break;
            }
        }
put_pages:
        for (i = 0; i < pagevec_count(&pvec); i++)
            put_page(pvec.pages[i]); // put_page() each page of the batch (NOTE(review): presumably drops the reference obtained via filemap_get_pages - confirm)
          pagevec_reinit(&pvec);
    } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);

    file_accessed(filp);

    return already_read ? already_read : error;
}

當我們從文件讀取數據時,在非dio的場景下,往往是先判斷文件對應的page是否存在page cache中,如果存在并且該page的內容是最新的(uptodate),那么就可以直接從page cache讀取。通過page cache可以大大提升文件的讀寫性能,page cache的讀取具體實現細節則在 filemap_get_pages

/*
 * filemap_get_pages - collect the pages needed for the next part of a read.
 * @iocb: request descriptor (position and flags)
 * @iter: destination iterator (its count bounds the page range)
 * @pvec: output page vector
 *
 * Steps: batch lookup; if empty, synchronous readahead and a second
 * lookup; if still empty, create one page with filemap_create_page().
 * Otherwise the last page of the batch is checked for PageReadahead
 * (triggering filemap_readahead) and for !PageUptodate (triggering
 * filemap_update_page).
 *
 * Returns 0 when @pvec holds at least one usable page, -EINTR on a pending
 * fatal signal, -EAGAIN when the IOCB_NOIO / IOCB_NOWAIT / IOCB_WAITQ
 * flags forbid the required work, or the error from the helpers.
 */
static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
        struct pagevec *pvec)
{
    struct file *filp = iocb->ki_filp;
    struct address_space *mapping = filp->f_mapping;
    struct file_ra_state *ra = &filp->f_ra;
    pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
    pgoff_t last_index;
    struct page *page;
    int err = 0;

    last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE);
retry:
    if (fatal_signal_pending(current))
        return -EINTR;

    filemap_get_read_batch(mapping, index, last_index, pvec); // look up a batch of pages
    if (!pagevec_count(pvec)) {
        if (iocb->ki_flags & IOCB_NOIO)
            return -EAGAIN;
        page_cache_sync_readahead(mapping, ra, filp, index,
                last_index - index);
        filemap_get_read_batch(mapping, index, last_index, pvec);
    }
    if (!pagevec_count(pvec)) {
        if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
            return -EAGAIN;
        err = filemap_create_page(filp, mapping,
                iocb->ki_pos >> PAGE_SHIFT, pvec); // still nothing in the cache: create the page and read it in (filemap_create_page)
        if (err == AOP_TRUNCATED_PAGE)
            goto retry;
        return err;
    }

    page = pvec->pages[pagevec_count(pvec) - 1];
    if (PageReadahead(page)) {
        err = filemap_readahead(iocb, filp, mapping, page, last_index); // last page carries the readahead mark: issue further readahead
// for sequential I/O, reading ahead can speed up the next read and reduce disk I/O
        if (err)
            goto err;
    }
    if (!PageUptodate(page)) {
        if ((iocb->ki_flags & IOCB_WAITQ) && pagevec_count(pvec) > 1)
            iocb->ki_flags |= IOCB_NOWAIT;
        err = filemap_update_page(iocb, mapping, iter, page);
        if (err)
            goto err;
    }

    return 0;
err:
    /* drop the failing last page; succeed if earlier pages remain in pvec */
    if (err < 0)
        put_page(page);
    if (likely(--pvec->nr))
        return 0;
    if (err == AOP_TRUNCATED_PAGE)
        goto retry;
    return err;
}

當文件的page不存在page cache中(即page cache未命中),則會進入對應的處理函數 filemap_create_page

/*
 * filemap_create_page - allocate a page, insert it into the page cache at
 * @index and read its contents.
 * @file:    file being read
 * @mapping: address_space the page is added to
 * @index:   page index within the mapping
 * @pvec:    page vector that receives the page on success
 *
 * Returns 0 on success, -ENOMEM if allocation fails, AOP_TRUNCATED_PAGE if
 * the insert reports -EEXIST, or the error from add_to_page_cache_lru() /
 * filemap_read_page(). On every error path after allocation the page is
 * released with put_page().
 */
static int filemap_create_page(struct file *file,
        struct address_space *mapping, pgoff_t index,
        struct pagevec *pvec)
{
    struct page *page;
    int error;

    page = page_cache_alloc(mapping); // allocate a page for the page cache
    if (!page)
        return -ENOMEM;

    /*
     * Protect against truncate / hole punch. Grabbing invalidate_lock here
     * assures we cannot instantiate and bring uptodate new pagecache pages
     * after evicting page cache during truncate and before actually
     * freeing blocks.  Note that we could release invalidate_lock after
     * inserting the page into page cache as the locked page would then be
     * enough to synchronize with hole punching. But there are code paths
     * such as filemap_update_page() filling in partially uptodate pages or
     * ->readpages() that need to hold invalidate_lock while mapping blocks
     * for IO so let's hold the lock here as well to keep locking rules
     * simple.
     */
    filemap_invalidate_lock_shared(mapping);
    error = add_to_page_cache_lru(page, mapping, index,
            mapping_gfp_constraint(mapping, GFP_KERNEL));// insert into the page cache and the LRU
    if (error == -EEXIST)
        error = AOP_TRUNCATED_PAGE;
    if (error)
        goto error;

    error = filemap_read_page(file, mapping, page); // read the page's contents (filemap_read_page)
    if (error)
        goto error;

    filemap_invalidate_unlock_shared(mapping);
    pagevec_add(pvec, page);
    return 0;
error:
    filemap_invalidate_unlock_shared(mapping);
    put_page(page);
    return error;
}

缺頁處理函數首先會分配內存page,分配內存page的實現為 page_cache_alloc,此處通過內核的內存分配伙伴系統分配一個page,伙伴系統的詳細實現本文先不做深入探討。

#ifndef CONFIG_NUMA
/* Without NUMA: allocate through alloc_pages() with order 0. */
static inline struct page *__page_cache_alloc(gfp_t gfp)
{
    struct page *page = alloc_pages(gfp, 0);

    return page;
}
#else
/* With NUMA the implementation is provided out of line. */
extern struct page *__page_cache_alloc(gfp_t gfp);
#endif

/*
 * page_cache_alloc - allocate a page using the gfp mask stored on the
 * address_space @x (mapping_gfp_mask()).
 */
static inline struct page *page_cache_alloc(struct address_space *x)
{
    gfp_t gfp = mapping_gfp_mask(x);

    return __page_cache_alloc(gfp);
}

[alloc_pages](https://elixir.bootlin.com/linux/v5.15/source/include/linux/gfp.h#L588)

/*
 * alloc_pages - allocate pages of the given order with the given gfp mask.
 *
 * With CONFIG_NUMA the function is defined out of line; without it, it is
 * an inline wrapper around alloc_pages_node() for the current node
 * (numa_node_id()).
 */
#ifdef CONFIG_NUMA
struct page *alloc_pages(gfp_t gfp, unsigned int order);
extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
            struct vm_area_struct *vma, unsigned long addr,
            int node, bool hugepage);
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
    alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
#else
static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
{
    return alloc_pages_node(numa_node_id(), gfp_mask, order);
}
#endif /* CONFIG_NUMA */

分配完page后,則通過 filemap_read_page 從磁盤讀取文件page

/*
 * filemap_read_page - read one page through the mapping's ->readpage hook
 * and wait for it to complete.
 * @file:    file being read
 * @mapping: address_space whose a_ops->readpage is called
 * @page:    page to fill
 *
 * Returns 0 if the page is uptodate afterwards, the error from ->readpage
 * or from wait_on_page_locked_killable(), or -EIO if the page is still not
 * uptodate once unlocked (after shrinking the readahead window).
 */
static int filemap_read_page(struct file *file, struct address_space *mapping,
        struct page *page)
{
    int error;

    /*
     * A previous I/O error may have been due to temporary failures,
     * eg. multipath errors.  PG_error will be set again if readpage
     * fails.
     */
    ClearPageError(page);
    /* Start the actual read. The read will unlock the page. */
    error = mapping->a_ops->readpage(file, page);// readpage is a function pointer; for ext4 the implementation is
// ext4_readpage (the .readpage entry of ext4_aops)
    if (error)
        return error;

    error = wait_on_page_locked_killable(page);
    if (error)
        return error;
    if (PageUptodate(page))
        return 0;
    shrink_readahead_size_eio(&file->f_ra);
    return -EIO;
}

從物理文件讀取page的核心定義為 a_ops,該結構體定義了從物理文件讀取page的一系列函數,不同的文件系統對應到具體不同的函數實現。

/*
 * struct address_space_operations - per-mapping table of operation hooks,
 * reached through mapping->a_ops.
 *
 * On the read path shown in this article, filemap_read_page() calls
 * ->readpage and generic_file_read_iter() calls ->direct_IO.
 */
struct address_space_operations {
    int (*writepage)(struct page *page, struct writeback_control *wbc);
    int (*readpage)(struct file *, struct page *);

    /* Write back some dirty pages from this mapping. */
    int (*writepages)(struct address_space *, struct writeback_control *);

    /* Set a page dirty.  Return true if this dirtied it */
    int (*set_page_dirty)(struct page *page);

    /*
     * Reads in the requested pages. Unlike ->readpage(), this is
     * PURELY used for read-ahead!.
     */
    int (*readpages)(struct file *filp, struct address_space *mapping,
            struct list_head *pages, unsigned nr_pages);
    void (*readahead)(struct readahead_control *);

    int (*write_begin)(struct file *, struct address_space *mapping,
                loff_t pos, unsigned len, unsigned flags,
                struct page **pagep, void **fsdata);
    int (*write_end)(struct file *, struct address_space *mapping,
                loff_t pos, unsigned len, unsigned copied,
                struct page *page, void *fsdata);

    /* Unfortunately this kludge is needed for FIBMAP. Don't use it */
    sector_t (*bmap)(struct address_space *, sector_t);
    void (*invalidatepage) (struct page *, unsigned int, unsigned int);
    int (*releasepage) (struct page *, gfp_t);
    void (*freepage)(struct page *);
    ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
    /*
     * migrate the contents of a page to the specified target. If
     * migrate_mode is MIGRATE_ASYNC, it must not block.
     */
    int (*migratepage) (struct address_space *,
            struct page *, struct page *, enum migrate_mode);
    bool (*isolate_page)(struct page *, isolate_mode_t);
    void (*putback_page)(struct page *);
    int (*launder_page) (struct page *);
    int (*is_partially_uptodate) (struct page *, unsigned long,
                    unsigned long);
    void (*is_dirty_writeback) (struct page *, bool *, bool *);
    int (*error_remove_page)(struct address_space *, struct page *);

    /* swapfile support */
    int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
                sector_t *span);
    void (*swap_deactivate)(struct file *file);
};

同樣以ext4為例,ext4 a_ops的具體定義實現如下,相對應的readpage實現為ext4_readpage

/*
 * ext4's address_space_operations table.
 *
 * .readpage is ext4_readpage, which is what mapping->a_ops->readpage
 * resolves to in filemap_read_page() for a mapping using this table.
 */
static const struct address_space_operations ext4_aops = {
    .readpage       = ext4_readpage,
    .readahead      = ext4_readahead,
    .writepage      = ext4_writepage,
    .writepages     = ext4_writepages,
    .write_begin        = ext4_write_begin,
    .write_end      = ext4_write_end,
    .set_page_dirty     = ext4_set_page_dirty,
    .bmap           = ext4_bmap,
    .invalidatepage     = ext4_invalidatepage,
    .releasepage        = ext4_releasepage,
    .direct_IO      = noop_direct_IO,
    .migratepage        = buffer_migrate_page,
    .is_partially_uptodate  = block_is_partially_uptodate,
    .error_remove_page  = generic_error_remove_page,
    .swap_activate      = ext4_iomap_swap_activate,
};

ext4_readpage

/*
 * ext4_readpage - ext4's ->readpage implementation.
 * @file: file being read (not used in this function)
 * @page: page to fill
 *
 * For inodes with inline data the inline reader is tried first; if the
 * inode has no inline data, or the inline reader answers -EAGAIN, the
 * read is handed to ext4_mpage_readpages(). Any other inline-reader
 * result is returned unchanged.
 */
static int ext4_readpage(struct file *file, struct page *page)
{
    struct inode *host = page->mapping->host;
    int status;

    trace_ext4_readpage(page);

    if (!ext4_has_inline_data(host))
        return ext4_mpage_readpages(host, NULL, page);

    status = ext4_readpage_inline(host, page);
    if (status != -EAGAIN)
        return status;

    /* inline path asked us to fall back to the regular block read */
    return ext4_mpage_readpages(host, NULL, page);
}

ext4_mpage_readpages則構造bio請求從塊設備讀取數據,通過構造bio將任務提交到io調度器,從而向塊設備驅動提交讀請求。至此,用戶發起讀系統調用請求真正進入磁盤塊設備讀取物理文件數據。

reference:

  1. SYSCALL_DEFINE https://blog.csdn.net/hxmhyp/article/details/22699669
©著作權歸作者所有,轉載或內容合作請聯系作者
  • 序言:七十年代末镣丑,一起剝皮案震驚了整個濱河市,隨后出現(xiàn)的幾起案子娱两,更是在濱河造成了極大的恐慌莺匠,老刑警劉巖,帶你破解...
    沈念sama閱讀 210,914評論 6 490
  • 序言:濱河連續(xù)發(fā)生了三起死亡事件十兢,死亡現(xiàn)場離奇詭異趣竣,居然都是意外死亡,警方通過查閱死者的電腦和手機旱物,發(fā)現(xiàn)死者居然都...
    沈念sama閱讀 89,935評論 2 383
  • 文/潘曉璐 我一進店門遥缕,熙熙樓的掌柜王于貴愁眉苦臉地迎上來,“玉大人宵呛,你說我怎么就攤上這事单匣。” “怎么了宝穗?”我有些...
    開封第一講書人閱讀 156,531評論 0 345
  • 文/不壞的土叔 我叫張陵户秤,是天一觀的道長。 經(jīng)常有香客問我逮矛,道長鸡号,這世上最難降的妖魔是什么? 我笑而不...
    開封第一講書人閱讀 56,309評論 1 282
  • 正文 為了忘掉前任橱鹏,我火速辦了婚禮膜蠢,結(jié)果婚禮上,老公的妹妹穿的比我還像新娘莉兰。我一直安慰自己挑围,他們只是感情好,可當(dāng)我...
    茶點故事閱讀 65,381評論 5 384
  • 文/花漫 我一把揭開白布糖荒。 她就那樣靜靜地躺著杉辙,像睡著了一般。 火紅的嫁衣襯著肌膚如雪捶朵。 梳的紋絲不亂的頭發(fā)上蜘矢,一...
    開封第一講書人閱讀 49,730評論 1 289
  • 那天,我揣著相機與錄音综看,去河邊找鬼品腹。 笑死,一個胖子當(dāng)著我的面吹牛红碑,可吹牛的內(nèi)容都是我干的舞吭。 我是一名探鬼主播泡垃,決...
    沈念sama閱讀 38,882評論 3 404
  • 文/蒼蘭香墨 我猛地睜開眼,長吁一口氣:“原來是場噩夢啊……” “哼羡鸥!你這毒婦竟也來了蔑穴?” 一聲冷哼從身側(cè)響起,我...
    開封第一講書人閱讀 37,643評論 0 266
  • 序言:老撾萬榮一對情侶失蹤惧浴,失蹤者是張志新(化名)和其女友劉穎存和,沒想到半個月后,有當(dāng)?shù)厝嗽跇淞掷锇l(fā)現(xiàn)了一具尸體衷旅,經(jīng)...
    沈念sama閱讀 44,095評論 1 303
  • 正文 獨居荒郊野嶺守林人離奇死亡捐腿,尸身上長有42處帶血的膿包…… 初始之章·張勛 以下內(nèi)容為張勛視角 年9月15日...
    茶點故事閱讀 36,448評論 2 325
  • 正文 我和宋清朗相戀三年,在試婚紗的時候發(fā)現(xiàn)自己被綠了芜茵。 大學(xué)時的朋友給我發(fā)了我未婚夫和他白月光在一起吃飯的照片叙量。...
    茶點故事閱讀 38,566評論 1 339
  • 序言:一個原本活蹦亂跳的男人離奇死亡,死狀恐怖九串,靈堂內(nèi)的尸體忽然破棺而出绞佩,到底是詐尸還是另有隱情,我是刑警寧澤猪钮,帶...
    沈念sama閱讀 34,253評論 4 328
  • 正文 年R本政府宣布品山,位于F島的核電站,受9級特大地震影響烤低,放射性物質(zhì)發(fā)生泄漏肘交。R本人自食惡果不足惜,卻給世界環(huán)境...
    茶點故事閱讀 39,829評論 3 312
  • 文/蒙蒙 一扑馁、第九天 我趴在偏房一處隱蔽的房頂上張望涯呻。 院中可真熱鬧,春花似錦腻要、人聲如沸复罐。這莊子的主人今日做“春日...
    開封第一講書人閱讀 30,715評論 0 21
  • 文/蒼蘭香墨 我抬頭看了看天上的太陽效诅。三九已至,卻和暖如春趟济,著一層夾襖步出監(jiān)牢的瞬間乱投,已是汗流浹背。 一陣腳步聲響...
    開封第一講書人閱讀 31,945評論 1 264
  • 我被黑心中介騙來泰國打工顷编, 沒想到剛下飛機就差點兒被人妖公主榨干…… 1. 我叫王不留戚炫,地道東北人。 一個月前我還...
    沈念sama閱讀 46,248評論 2 360
  • 正文 我出身青樓媳纬,卻偏偏與公主長得像嘹悼,于是被迫代替她去往敵國和親叛甫。 傳聞我的和親對象是個殘疾皇子,可洞房花燭夜當(dāng)晚...
    茶點故事閱讀 43,440評論 2 348

推薦閱讀更多精彩內(nèi)容