1.概述
我們知道,Linux把設備看成特殊的文件,稱爲設備文件。在操作文件之前,首先必須打開文件,打開文件的函數是通過open系統調用來實現的。而簡單的文件打開操作,在Linux內核實現卻是非常的複雜。open函數打開原理就是將進程files_struct結構體和文件對象file相關聯。那麼具體是怎麼實現的呢?讓我們一起走進Linux內核文件打開流程。
2. 首先,通過系統調用sys_open
函數:
/*
 * open(2) system call entry point.
 *
 * Adds O_LARGEFILE to @flags when force_o_largefile() says so, then hands
 * the request to do_sys_open() with AT_FDCWD as the base directory.
 * Returns whatever do_sys_open() returns.
 */
asmlinkage long sys_open(const char __user *filename, int flags, int mode)
{
	long fd;

	if (force_o_largefile())
		flags = flags | O_LARGEFILE;

	/* All the real work happens in do_sys_open(). */
	fd = do_sys_open(AT_FDCWD, filename, flags, mode);
	/* avoid REGPARM breakage on x86: */
	prevent_tail_call(fd);
	return fd;
}
這個函數進行了簡單的處理,調用do_sys_open
函數:
/*
 * Common implementation behind the open system call.
 *
 * Copies @filename in with getname(), reserves a descriptor with
 * get_unused_fd(), builds the struct file with do_filp_open() and finally
 * publishes it with fd_install().
 *
 * Returns the new descriptor, or the negative error produced by whichever
 * step failed.
 */
long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
{
	struct file *filp;
	char *kname;
	int fd;

	/* Bring the user-supplied path name into kernel space. */
	kname = getname(filename);
	if (IS_ERR(kname)) {
		fd = PTR_ERR(kname);
		return fd;
	}

	/* Reserve a descriptor number that is not in use yet. */
	fd = get_unused_fd();
	if (fd < 0)
		goto out_putname;

	/* Build the file object for the path. */
	filp = do_filp_open(dfd, kname, flags, mode);
	if (IS_ERR(filp)) {
		/* Opening failed: give the reserved descriptor back. */
		put_unused_fd(fd);
		fd = PTR_ERR(filp);
		goto out_putname;
	}

	fsnotify_open(filp->f_path.dentry);
	/* Tie the file object to the descriptor in current->files. */
	fd_install(fd, filp);

out_putname:
	putname(kname);
	return fd;
}
這個函數主要完成以下幾件事情:
(1)調用get_unused_fd
得到一個沒有使用的文件描述符,這是爲讀,寫準備的,每個打開的文件都有一個文件描述符。
(2) 調用do_filp_open
構建 struct file文件對象,並填充相關信息,這個函數非常複雜,我們以後再看。
(3) 調用fd_install
將文件對象和進程的files_struct
對象關聯。
首先看一下get_unused_fd
函數
/*
 * Find an empty file descriptor entry, and mark it busy.
 *
 * Searches the open_fds bitmap of the current process for a clear bit,
 * starting at files->next_fd, sets that bit and clears the matching
 * close_on_exec bit.
 *
 * Returns the reserved descriptor number on success. Returns -EMFILE when
 * the candidate descriptor reaches the RLIMIT_NOFILE soft limit, or the
 * negative value returned by expand_files().
 */
int get_unused_fd(void)
{
/* Open-file bookkeeping structure of the current process. */
struct files_struct * files = current->files;
int fd, error;
/* Descriptor table; fetched again on every pass through "repeat". */
struct fdtable *fdt;
error = -EMFILE;
spin_lock(&files->file_lock);
repeat:
/* Get the fdtable pointer that belongs to files. */
fdt = files_fdtable(files);
/*
 * Look for the first zero bit in fdt->open_fds->fds_bits at or after
 * files->next_fd. open_fds is the set of open descriptors: a set bit
 * means the descriptor is in use, a clear bit means it is free.
 */
fd = find_next_zero_bit(fdt->open_fds->fds_bits, fdt->max_fds,
files->next_fd);
/*
* N.B. For clone tasks sharing a files structure, this test
* will limit the total number of files that can be opened.
*/
if (fd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
goto out;
/* Do we need to expand the fd array or fd set? */
error = expand_files(files, fd);
if (error < 0)
goto out;
if (error) {
/*
* If we needed to expand the fd array we
* might have blocked - try again.
*/
error = -EMFILE;
goto repeat;
}
/* Mark fd as open and make sure it is not flagged close-on-exec. */
FD_SET(fd, fdt->open_fds);
FD_CLR(fd, fdt->close_on_exec);
/* The next search starts one past the descriptor just handed out. */
files->next_fd = fd + 1;
#if 1
/* Sanity check: a free descriptor must not already have a file attached. */
if (fdt->fd[fd] != NULL) {
printk(KERN_WARNING "get_unused_fd: slot %d not NULL!\n", fd);
fdt->fd[fd] = NULL;
}
#endif
error = fd;
out:
spin_unlock(&files->file_lock);
return error;
}
在第7行,得到當前進程的files指針。
在第16-18行,返回打開文件表,在打開的文件描述符集open_fds
的fds_bits
數組查找對應位置爲0的位圖,返回位置,表示這個文件描述符沒有被使用。
接下來,在41-43行,分別將open_fds
的fd位置的位圖置位,並將fd+1賦值給下一個文件描述符。如果這個文件描述符被佔用,就將fdt->fd[fd]=NULL
.
最後返回文件描述符fd.
接下來,調do_filp_open
函數,其主要功能是返回一個已經填充好的文件對象指針。這個函數比較複雜,在下一節進行分析。
最後,分析一下fd_install函數,傳入參數文件描述符fd和文件對象f,具體如下:
/*
 * Install a file pointer in the fd array.
 *
 * The VFS is full of places where we drop the files lock between
 * setting the open_fds bitmap and installing the file in the file
 * array. At any such point, we are vulnerable to a dup2() race
 * installing a file in the array before us. We need to detect this and
 * fput() the struct file we are about to overwrite in this case.
 *
 * It should never happen - if we allow dup2() do it, _really_ bad things
 * will follow.
 *
 * In short: slot @fd of the current process's descriptor table is made to
 * point at @file, under files->file_lock.
 */
void fastcall fd_install(unsigned int fd, struct file * file)
{
	struct files_struct *owner = current->files;	/* files of the current process */
	struct fdtable *table;				/* its descriptor table */

	spin_lock(&owner->file_lock);

	/* Look the table up only once the lock is held. */
	table = files_fdtable(owner);
	/* The slot must still be empty at this point. */
	BUG_ON(table->fd[fd] != NULL);
	/* Publish the file object in slot fd. */
	rcu_assign_pointer(table->fd[fd], file);

	spin_unlock(&owner->file_lock);
}
這個函數首先得到files_struct
對象指針,然後調用rcu_assign_pointer
,將文件對象file賦給fdt->fd[fd]
, 這樣,文件對象就和進程相關聯起來了。
因此,不同的進程打開相同的文件,每次打開都會構建一個struct file文件對象,然後將這個對象和具體的進程相關聯。其實open調用可以概括如下:
(1)得到一個未使用的文件描述符
(2)構建文件對象struct file
(3)將文件對象和進程相關聯
繼續分析do_filp_open
函數,其傳入4個參數:
dfd:相對目錄
tmp:文件路徑名,例如要打開/usr/src/kernels/linux-2.6.30
flags:打開標誌
mode:打開模式
/*
 * Note that while the flag value (low two bits) for sys_open means:
 *	00 - read-only
 *	01 - write-only
 *	10 - read-write
 *	11 - special
 * it is changed into
 *	00 - no permissions needed
 *	01 - read-permission
 *	10 - write-permission
 *	11 - read-write
 * for the internal routines (ie open_namei()/follow_link() etc). 00 is
 * used by symlinks.
 *
 * Resolves @filename with open_namei() into a nameidata and turns that
 * into a struct file with nameidata_to_filp(). Returns the file pointer,
 * or ERR_PTR() of the error reported by open_namei().
 */
static struct file *do_filp_open(int dfd, const char *filename, int flags,
				 int mode)
{
	/* Receives the mount and dentry found by the lookup. */
	struct nameidata nd;
	int lookup_flags = flags;
	int err;

	/* Re-encode the access mode as described above. */
	if ((lookup_flags + 1) & O_ACCMODE)
		lookup_flags++;

	/* Walk the path and fill in nd. */
	err = open_namei(dfd, filename, lookup_flags, mode, &nd);
	if (err)
		return ERR_PTR(err);

	/* Convert the lookup result into a struct file, using the caller's flags. */
	return nameidata_to_filp(&nd, flags);
}
此函數調用了open_namei
和nameidata_to_filp
. 後一個函數通過名字就可以猜出來,是將nameidata結構轉化爲filp,也就是利用nd結構賦值給文件指針file,然後返回這個文件指針。而open_namei
肯定是填充nd結構體,具體功能可表述爲: 根據上級目錄項對象,查詢下一級的目錄項對象,如果在目錄項緩存找到下一級的目錄項對象,則直接返回,並填充nd的掛載點對象和目錄項對象。否則,構建一個子目錄項對象,並利用iget函數分配一個新的inode結構,將子目錄項對象和inode結構相關聯。這樣,一直循環到最後一個分量。最後返回的是最後一個分量的目錄項對象和掛載點對象。可以看到,在這兩個函數中,都利用了nameidata結構,具體看一下結構:
/* State carried through a path lookup; filled in by the lookup routines. */
struct nameidata {
struct dentry *dentry;/* current directory entry object */
struct vfsmount *mnt;/* mounted filesystem object the dentry belongs to */
struct qstr last;/* last component of the path name */
unsigned int flags;/* lookup flags */
int last_type;/* type of the last path component */
unsigned depth;/* current symbolic-link nesting depth */
char *saved_names[MAX_NESTED_LINKS + 1];/* path names associated with nested symbolic links */
/* Intent data */
union {
struct open_intent open;/* open intent: what the caller wants to open */
} intent;
};
/* Describes the open operation a lookup is being performed for. */
struct open_intent {
int flags;/* open flags */
int create_mode;/* mode to use when creating the file */
struct file *file;/* file object being opened */
};
open_intent
結構中的file域所指向的文件對象就是最後返回的文件對象。
由於nameidata_to_filp
比較簡單,先看一下:
/**
 * nameidata_to_filp - convert a nameidata to an open filp.
 * @nd: pointer to nameidata
 * @flags: open flags
 *
 * Takes the struct file stored in the open intent of @nd. If its dentry
 * has not been set yet, the file is initialised through __dentry_open();
 * otherwise only the nameidata is released.
 *
 * Returns whatever __dentry_open() returns in the first case, or the
 * already-initialised file in the second.
 *
 * Note that this function destroys the original nameidata
 */
struct file *nameidata_to_filp(struct nameidata *nd, int flags)
{
	struct file *filp;

	/* Pick up the filp from the open intent */
	filp = nd->intent.open.file;
	/* Has the filesystem initialised the file for us? */
	if (filp->f_path.dentry == NULL)
		filp = __dentry_open(nd->dentry, nd->mnt, flags, filp, NULL);
	else
		path_release(nd);
	return filp;
}
首先取得文件對象指針,然後判斷文件對象是否已經初始化,如果沒有初始化,就調用__dentry_open
函數,對文件對象進行初始化
/*
 * Initialise the struct file @f for @dentry on mount @mnt.
 *
 * @flags: open flags, stored in f->f_flags and used to derive f->f_mode.
 * @open:  open callback to invoke; when NULL, f->f_op->open is used if
 *         f->f_op is set.
 *
 * Returns @f on success. On failure returns ERR_PTR() of the error from
 * get_write_access() or from the open callback, or ERR_PTR(-EINVAL) when
 * O_DIRECT is requested but the mapping offers neither direct_IO nor
 * get_xip_page.
 */
static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
int flags, struct file *f,
int (*open)(struct inode *, struct file *))
{
struct inode *inode;
int error;
/* Record the open flags and derive the file mode from them. */
f->f_flags = flags;
f->f_mode = ((flags+1) & O_ACCMODE) | FMODE_LSEEK |
FMODE_PREAD | FMODE_PWRITE;
/* Inode behind the dentry. */
inode = dentry->d_inode;
if (f->f_mode & FMODE_WRITE) {
error = get_write_access(inode);
if (error)
goto cleanup_file;
}
/* Address space object, taken from the inode. */
f->f_mapping = inode->i_mapping;
/* Directory entry object. */
f->f_path.dentry = dentry;
/* Mount object. */
f->f_path.mnt = mnt;
/* File position starts at the beginning. */
f->f_pos = 0;
/* Take the file operations from the inode's i_fop. */
f->f_op = fops_get(inode->i_fop);
file_move(f, &inode->i_sb->s_files);
/* Pick the open method to run. */
if (!open && f->f_op)/* no callback supplied by the caller */
open = f->f_op->open;
/* Only call open when there is one; it may be NULL. */
if (open) {
error = open(inode, f);
if (error)
goto cleanup_all;
}
/* These flags are cleared from f_flags once the open has been done. */
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
/* Initialise read-ahead state. */
file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
/* NB: we're sure to have correct a_ops only after f_op->open */
if (f->f_flags & O_DIRECT) {
if (!f->f_mapping->a_ops ||
((!f->f_mapping->a_ops->direct_IO) &&
(!f->f_mapping->a_ops->get_xip_page))) {
fput(f);
f = ERR_PTR(-EINVAL);
}
}
return f;
/* The open callback failed: undo everything set up above. */
cleanup_all:
fops_put(f->f_op);
if (f->f_mode & FMODE_WRITE)
put_write_access(inode);
file_kill(f);
f->f_path.dentry = NULL;
f->f_path.mnt = NULL;
/* Also reached directly when get_write_access() failed. */
cleanup_file:
put_filp(f);
dput(dentry);
mntput(mnt);
return ERR_PTR(error);
}
首先,設置文件打開標誌f->f_flags
. 然後初始化地址空間對象,目錄項對象,掛載點對象,文件指針位置,文件相關操作。需要說明兩點:
(1)地址空間對象和索引節點相關聯,在構建索引節點時已經賦值了。它涉及到具體的磁盤塊操作,在後面的章節將會解釋。
(2)f_op
這個非常重要,也是在構建索引節點時,將具體文件系統的文件操作函數集的指針賦給索引節點的i_fop域。對於打開文件,目錄,符號鏈接,對應的操作函數集是不相同的。
接下來,第31行-38行,如果是普通文件,可能不需要打開。如果是設備文件,就需要打開操作。例如SCSI設備的sg_open
函數。
最後,對文件預讀進行初始化。
在說完nameidata_to_filp
函數之後,需要解釋open_namei
函數:
/*
* open_namei()
*
* namei for open - this is in fact almost the whole open-routine.
*
* Note that the low bits of "flag" aren't the same as in the open
* system call - they are 00 - no permissions needed
* 01 - read permission needed
* 10 - write permission needed
* 11 - read/write permissions needed
* which is a lot more logical, and also allows the "no perm" needed
* for symlinks (where the permissions are checked later).
* SMP-safe
*
* Returns 0 with nd filled in on success, or a negative error code.
*/
int open_namei(int dfd, const char *pathname, int flag,
int mode, struct nameidata *nd)
{
int acc_mode, error;
/* Holds a mount object together with a dentry object. */
struct path path;
struct dentry *dir;
/* Number of symlinks followed so far in the do_link loop. */
int count = 0;
acc_mode = ACC_MODE(flag);
/* O_TRUNC implies we need access checks for write permissions */
if (flag & O_TRUNC)
acc_mode |= MAY_WRITE;
/* Allow the LSM permission hook to distinguish append
access from general write access. */
if (flag & O_APPEND)
acc_mode |= MAY_APPEND;
/*
* The simplest case - just a plain lookup.
* Nothing has to be created: look the path up with open intent and
* let the lookup fill nd.
*/
if (!(flag & O_CREAT)) {
error = path_lookup_open(dfd, pathname, lookup_flags(flag),
nd, flag);
if (error)
return error;
goto ok;
}
/*
* Create - we need to know the parent.
* The lookup is done with LOOKUP_PARENT, so it stops at the parent
* directory of the last component.
*/
error = path_lookup_create(dfd,pathname,LOOKUP_PARENT,nd,flag,mode);
if (error)
return error;
/*
* We have the parent and last component. First of all, check
* that we are not asked to creat(2) an obvious directory - that
* will not do.
*/
error = -EISDIR;
if (nd->last_type != LAST_NORM || nd->last.name[nd->last.len])
goto exit;
/*
 * On the create path nd holds the parent's dentry and mount; on the
 * plain-open path above it holds those of the last component.
 */
dir = nd->dentry;
nd->flags &= ~LOOKUP_PARENT;
mutex_lock(&dir->d_inode->i_mutex);
/* Look up the last component under the parent and record its mount. */
path.dentry = lookup_hash(nd);
path.mnt = nd->mnt;
do_last:
error = PTR_ERR(path.dentry);
if (IS_ERR(path.dentry)) {
mutex_unlock(&dir->d_inode->i_mutex);
goto exit;
}
if (IS_ERR(nd->intent.open.file)) {
mutex_unlock(&dir->d_inode->i_mutex);
error = PTR_ERR(nd->intent.open.file);
goto exit_dput;
}
/* Negative dentry, just create the file */
if (!path.dentry->d_inode) {
/* No inode behind the dentry yet: create the file. */
error = open_namei_create(nd, &path, flag, mode);
if (error)
goto exit;
return 0;
}
/*
* It already exists.
*/
mutex_unlock(&dir->d_inode->i_mutex);
audit_inode_update(path.dentry->d_inode);
error = -EEXIST;
if (flag & O_EXCL)
goto exit_dput;
if (__follow_mount(&path)) {
error = -ELOOP;
if (flag & O_NOFOLLOW)
goto exit_dput;
}
error = -ENOENT;
if (!path.dentry->d_inode)
goto exit_dput;
if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link)
goto do_link;
/* Copy the dentry and mount from path into nd. */
path_to_nameidata(&path, nd);
error = -EISDIR;
if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
goto exit;
ok:
error = may_open(nd, acc_mode, flag);
if (error)
goto exit;
return 0;
exit_dput:
dput_path(&path, nd);
exit:
if (!IS_ERR(nd->intent.open.file))
release_open_intent(nd);
path_release(nd);
return error;
do_link:
error = -ELOOP;
if (flag & O_NOFOLLOW)
goto exit_dput;
/*
* This is subtle. Instead of calling do_follow_link() we do the
* thing by hands. The reason is that this way we have zero link_count
* and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT.
* After that we have the parent and last component, i.e.
* we are in the same situation as after the first path_walk().
* Well, almost - if the last component is normal we get its copy
* stored in nd->last.name and we will have to putname() it when we
* are done. Procfs-like symlinks just set LAST_BIND.
*/
nd->flags |= LOOKUP_PARENT;
error = security_inode_follow_link(path.dentry, nd);
if (error)
goto exit_dput;
error = __do_follow_link(&path, nd);
if (error) {
/* Does someone understand code flow here? Or it is only
* me so stupid? Anathema to whoever designed this non-sense
* with "intent.open".
*/
release_open_intent(nd);
return error;
}
nd->flags &= ~LOOKUP_PARENT;
if (nd->last_type == LAST_BIND)
goto ok;
error = -EISDIR;
if (nd->last_type != LAST_NORM)
goto exit;
if (nd->last.name[nd->last.len]) {
__putname(nd->last.name);
goto exit;
}
/* Give up with -ELOOP once count has reached 32. */
error = -ELOOP;
if (count++==32) {
__putname(nd->last.name);
goto exit;
}
/* Redo the last-component lookup under the symlink's parent, then retry. */
dir = nd->dentry;
mutex_lock(&dir->d_inode->i_mutex);
path.dentry = lookup_hash(nd);
path.mnt = nd->mnt;
__putname(nd->last.name);
goto do_last;
}
首先進行文件打開設置工作,第40行,如果是打開操作,則調用path_lookup_open
函數。第53行,如果文件不存在,就創建一個文件,調用path_lookup_create
函數。在第88行,如果是創建文件,需要建立磁盤上的索引節點,即調用open_namei_create
函數。我們逐一解釋:
首先path_lookup_open
函數:
/**
 * path_lookup_open - lookup a file path with open intent
 * @dfd: the directory to use as base, or AT_FDCWD
 * @name: pointer to file name
 * @lookup_flags: lookup intent flags
 * @nd: pointer to nameidata
 * @open_flags: open intent flags
 *
 * Thin wrapper around __path_lookup_intent_open() that passes a create
 * mode of 0 and returns its result unchanged.
 */
int path_lookup_open(int dfd, const char *name, unsigned int lookup_flags,
		struct nameidata *nd, int open_flags)
{
	int err;

	err = __path_lookup_intent_open(dfd, name, lookup_flags, nd,
					open_flags, 0);
	return err;
}
path_lookup_open只是對__path_lookup_intent_open函數的簡單封裝,創建模式傳入0。path_lookup_create也封裝了__path_lookup_intent_open函數,只是增加了創建標誌LOOKUP_CREATE, 在create操作的lookup_flags設置了LOOKUP_PARENT,接下來,將看到這個標誌的作用。
繼續跟蹤__path_lookup_intent_open函數:
/*
 * Look up @name with an open intent attached to @nd.
 *
 * Gets an empty struct file, records it together with @open_flags and
 * @create_mode in nd->intent.open, then runs do_path_lookup() with
 * LOOKUP_OPEN added to @lookup_flags.
 *
 * Returns 0 on success, -ENFILE when no struct file could be obtained,
 * or the error from the lookup / from the intent's file pointer.
 */
static int __path_lookup_intent_open(int dfd, const char *name,
		unsigned int lookup_flags, struct nameidata *nd,
		int open_flags, int create_mode)
{
	struct file *new_file;
	int err;

	new_file = get_empty_filp();
	if (new_file == NULL)
		return -ENFILE;

	/* Describe what we intend to open. */
	nd->intent.open.file = new_file;
	nd->intent.open.flags = open_flags;
	nd->intent.open.create_mode = create_mode;

	err = do_path_lookup(dfd, name, lookup_flags|LOOKUP_OPEN, nd);

	if (!IS_ERR(nd->intent.open.file)) {
		/* The intent's file is still valid: drop it if the lookup failed. */
		if (err != 0)
			release_open_intent(nd);
		return err;
	}

	/* The intent's file was replaced by an error pointer. */
	if (err == 0) {
		err = PTR_ERR(nd->intent.open.file);
		path_release(nd);
	}
	return err;
}
首先調用get_empty_filp
函數分配一個空閒的文件對象filp, 設置intent.open
的相關域,包括“想要打開的文件”,打開標誌和創建模式。最後,調用do_path_lookup
對文件路徑進行解析,並填充nd。
/*
 * Path lookup: choose the starting mount and dentry for @name, store them
 * in @nd, then walk the path with link_path_walk().
 *
 * The starting point is the process root (or alternate root) when @name
 * begins with '/', the current working directory when @dfd is AT_FDCWD,
 * and otherwise the directory open on descriptor @dfd.
 */
/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
static int fastcall do_path_lookup(int dfd, const char *name,
unsigned int flags, struct nameidata *nd)
{
int retval = 0;
int fput_needed;
struct file *file;
struct fs_struct *fs = current->fs;
/* Default type of the last component, kept if the name is only slashes. */
nd->last_type = LAST_ROOT; /* if there are only slashes... */
nd->flags = flags;
nd->depth = 0;
/* Absolute path: start at the root. */
if (*name=='/') {
read_lock(&fs->lock);
if (fs->altroot && !(nd->flags & LOOKUP_NOALT)) {
/* Try the alternate root first: its mount ... */
nd->mnt = mntget(fs->altrootmnt);
/* ... and its dentry become the starting point. */
nd->dentry = dget(fs->altroot);
read_unlock(&fs->lock);
if (__emul_lookup_dentry(name,nd))
goto out; /* found in altroot */
read_lock(&fs->lock);
}
/* Start from the root mount of the process ... */
nd->mnt = mntget(fs->rootmnt);
/* ... and its root dentry. */
nd->dentry = dget(fs->root);
read_unlock(&fs->lock);
/* Relative path resolved against the current working directory. */
} else if (dfd == AT_FDCWD) {
read_lock(&fs->lock);
/* Mount of the current working directory, from the process fs_struct. */
nd->mnt = mntget(fs->pwdmnt);
/* Dentry of the current working directory, from the process fs_struct. */
nd->dentry = dget(fs->pwd);
read_unlock(&fs->lock);
} else {/* dfd != AT_FDCWD: resolve relative to the file open on dfd */
struct dentry *dentry;
/* Map dfd to its struct file. */
file = fget_light(dfd, &fput_needed);
retval = -EBADF;
if (!file)
goto out_fail;
/* Dentry behind that file. */
dentry = file->f_path.dentry;
/* dfd must refer to a directory. */
retval = -ENOTDIR;
if (!S_ISDIR(dentry->d_inode->i_mode))
goto fput_fail;
/* MAY_EXEC permission on the file is required. */
retval = file_permission(file, MAY_EXEC);
if (retval)
goto fput_fail;
/* Start from the mount of the file ... */
nd->mnt = mntget(file->f_path.mnt);
/* ... and from the dentry associated with the file. */
nd->dentry = dget(dentry);
fput_light(file, fput_needed);
}
current->total_link_count = 0;
/* Walk the path components starting from nd. */
retval = link_path_walk(name, nd);
out:
if (likely(retval == 0)) {
if (unlikely(!audit_dummy_context() && nd && nd->dentry &&
nd->dentry->d_inode))
audit_inode(name, nd->dentry->d_inode);
}
out_fail:
return retval;
/* Error exit for the dfd branch: drop the file obtained from fget_light(). */
fput_fail:
fput_light(file, fput_needed);
goto out_fail;
}
第11-14行,設置初始化nd->last_type
, flags和depth. 其中depth表示符號鏈接的深度。由於符號鏈接可以鏈接自己,因此需要限制鏈接的深度。
第16行,如果第一個字符爲/,表示從根目錄開始解析,設置nd->mnt爲根掛載點對象,nd->dentry
爲根目錄項對象,然後增加引用計數。
第34行,如果是從當前目錄開始,將nd->mnt
設置爲當前目錄的掛載點對象,nd->dentry
設置爲當前目錄的目錄項對象。
第41行,否則,將nd->mnt
和nd->dentry
分別設置爲f_path.mnt
和f_path.dentry
.
接下來,第63行,初始化符號鏈接總數,調用實際文件系統的路徑分解函數link_path_walk
.
/*
 * Resolve @name starting from the position stored in @nd.
 *
 * A copy of the starting nameidata is kept so that, when the first
 * __link_path_walk() returns -ESTALE, the walk can be repeated from the
 * same starting point with LOOKUP_REVAL set. Returns the result of the
 * last walk performed.
 */
int fastcall link_path_walk(const char *name, struct nameidata *nd)
{
	struct nameidata start = *nd;
	int err;

	/* make sure the stuff we saved doesn't go away */
	dget(start.dentry);
	mntget(start.mnt);

	/* First attempt at resolving the name. */
	err = __link_path_walk(name, nd);
	if (err == -ESTALE) {
		/* Rewind to the saved starting point and retry with LOOKUP_REVAL. */
		*nd = start;
		dget(nd->dentry);
		mntget(nd->mnt);
		nd->flags |= LOOKUP_REVAL;
		err = __link_path_walk(name, nd);
	}

	/* Drop the references held on the saved copy. */
	dput(start.dentry);
	mntput(start.mnt);

	return err;
}
首先,備份掛載點對象和目錄項對象,然後調用__link_path_walk
解析.