http://www.cppblog.com/momoxiao/archive/2010/04/04/111594.html
先通过strace来看下ls命令的执行都做了哪些系统调用:
运行结果,这儿只摘取了ls.txt中我们感兴趣的部分:
fstat64(3, {st_mode=S_IFDIR|0755, st_size=4096, }) = 0 ///取得当前目录文件的属性,比如这里大小为4096
fcntl64(3, F_GETFD) = 0x1 (flags FD_CLOEXEC)
getdents64(3, /* 33 entries */, 4096) = 1104 ///读取当前目录下的文件
getdents64(3, /* 0 entries */, 4096) = 0
close(3) = 0 ///关闭当前目录文件的句柄
这里核心是getdents64系统调用,它读取目录文件中的一个个目录项(directory entry)并返回,所以我们运行ls后才看到文件。
下面我们就看下getdents64是怎么用的,想办法干扰它的执行,从而隐藏掉我们不想让用户发现的文件。
fs/readdir.c
{
struct file * file;
struct linux_dirent64 __user * lastdirent;
struct getdents_callback64 buf;
int error;
error
= -EFAULT;if (!access_ok(VERIFY_WRITE, dirent, count))
goto out;
error
= -EBADF;file = fget(fd);
if (!file)
goto out;
buf.current_dir
= dirent;buf.previous = NULL;
buf.count = count;
buf.error = 0;
error
= vfs_readdir(file, filldir64, &buf); ///读取目录函数if (error < 0)
goto out_putf;
error = buf.error;
lastdirent = buf.previous;
if (lastdirent) {
typeof(lastdirent->d_off) d_off = file->f_pos;
error = -EFAULT;
if (__put_user(d_off, &lastdirent->d_off))
goto out_putf;
error = count - buf.count;
}
out_putf:
fput(file);
return error;
}
首先,在sys_getdents64中通过调用vfs_readdir()读取目录函数。
那么什么是vfs呢?vfs全名Virtual File Switch,就是虚拟文件系统。我们可以把Linux的文件系统看成三层,最上层是上层用户使用的系统调用,中间一层就是vfs,最下面一层是挂载到VFS中的各种实际文件系统,比如ext2,jffs等。Switch这个词在这儿用的很形象,上层同一个系统调用,在vfs这层会根据文件系统的类型,调用对应的内核函数。vfs这层,本身就是起一个switch的作用。
看下vfs_readdir()吧。
fs/readdir.c
{
struct inode *inode = file->f_path.dentry->d_inode;
int res = -ENOTDIR;
if (!file->f_op || !file->f_op->readdir)
goto out;
res
= security_file_permission(file, MAY_READ);if (res)
goto out;
res
= mutex_lock_killable(&inode->i_mutex);if (res)
goto out;
res
= -ENOENT;if (!IS_DEADDIR(inode)) {
res = file->f_op->readdir(file, buf, filler); ///调用实际文件系统的读取目录项(就是文件系统三层结构中最下面一层)
file_accessed(file);
}
mutex_unlock(&inode->i_mutex);
out:
return res;
}
里面file->f_op->readdir()读取底层实际文件系统的目录项。
大致的关系是这样的:
file结构里有个文件操作的函数集const struct file_operations *f_op。
struct file_operations 中实际上是一些函数的指针,readdir就是其中的一个指针。
在调用vir_readdir之前,内核会根据实际文件系统类型给struct file_operations赋对应值。
下面我们通过看代码,获得一个比较直观的认识。
struct file 和 struct file_operations都在/include/linux/fs.h中定义。
file结构:
/*
* fu_list becomes invalid after file_free is called and queued via
* fu_rcuhead for RCU freeing
*/
union {
struct list_head fu_list;
struct rcu_head fu_rcuhead;
} f_u;
struct path f_path;
#define f_dentry f_path.dentry
#define f_vfsmnt f_path.mnt
const struct file_operations *f_op; ///对应每一种实际的文件系统,会有自己的file_operations函数集。可以理解成file这个类的纯虚函数集
atomic_long_t f_count;
unsigned int f_flags;
mode_t f_mode;
loff_t f_pos;
struct fown_struct f_owner;
unsigned int f_uid, f_gid;
struct file_ra_state f_ra;
u64 f_version;
#ifdef CONFIG_SECURITY
#endif
/* needed for tty driver, and maybe others */
void *private_data;
#ifdef CONFIG_EPOLL
struct list_head f_ep_links;
spinlock_t f_ep_lock;
#endif /* #ifdef CONFIG_EPOLL */
struct address_space *f_mapping;
#ifdef CONFIG_DEBUG_WRITECOUNT
unsigned long f_mnt_write_state;
#endif
};
file_operations结构,里面是一些函数指针。我们在这儿关心的是int (*readdir) (struct file *, void *, filldir_t);
readdir()用来读取实际文件系统目录项。
struct module *owner;
loff_t (*llseek) (struct file *, loff_t, int);
ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
int (*readdir) (struct file *, void *, filldir_t); ///我们在这儿关心的函数指针,实际文件系统的读取目录项函数。
///每次打开文件,内核都会根据文件位于的文件系统类型,对文件相应的file_operations赋相应值。
unsigned int (*poll) (struct file *, struct poll_table_struct *);
int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *, fl_owner_t id);
int (*release) (struct inode *, struct file *);
int (*fsync) (struct file *, struct dentry *, int datasync);
int (*aio_fsync) (struct kiocb *, int datasync);
int (*fasync) (int, struct file *, int);
int (*lock) (struct file *, int, struct file_lock *);
ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
int (*check_flags)(int);
int (*dir_notify)(struct file *filp, unsigned long arg);
int (*flock) (struct file *, int, struct file_lock *);
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
int (*setlease)(struct file *, long, struct file_lock **);
};
下面来看下在ls用到file结构中的file_operations之前,内核是怎样它赋值的
{
struct ext2_inode_info *ei;
struct buffer_head * bh;
struct ext2_inode *raw_inode;
struct inode *inode;
long ret = -EIO;
int n;
inode
= iget_locked(sb, ino);if (!inode)
return ERR_PTR(-ENOMEM);
if (!(inode->i_state & I_NEW))
return inode;
ei
= EXT2_I(inode);#ifdef CONFIG_EXT2_FS_POSIX_ACL
ei->i_acl = EXT2_ACL_NOT_CACHED;
ei->i_default_acl = EXT2_ACL_NOT_CACHED;
#endif
ei->i_block_alloc_info = NULL;
raw_inode
= ext2_get_inode(inode->i_sb, ino, &bh);if (IS_ERR(raw_inode)) {
ret = PTR_ERR(raw_inode);
goto bad_inode;
}
inode
->i_mode = le16_to_cpu(raw_inode->i_mode);inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
if (!(test_opt (inode->i_sb, NO_UID32))) {
inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
}
inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
inode->i_size = le32_to_cpu(raw_inode->i_size);
inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0;
ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
/* We now have enough fields to check if the inode was active or not.
* This is needed because nfsd might try to access dead inodes
* the test is that same one that e2fsck uses
* NeilBrown 1999oct15
*/
if (inode->i_nlink == 0 && (inode->i_mode == 0 || ei->i_dtime)) {
/* this inode is deleted */
brelse (bh);
ret = -ESTALE;
goto bad_inode;
}
inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
ei->i_flags = le32_to_cpu(raw_inode->i_flags);
ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
ei->i_frag_no = raw_inode->i_frag;
ei->i_frag_size = raw_inode->i_fsize;
ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
ei->i_dir_acl = 0;
if (S_ISREG(inode->i_mode))
inode->i_size |= ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
else
ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
ei->i_dtime = 0;
inode->i_generation = le32_to_cpu(raw_inode->i_generation);
ei->i_state = 0;
ei->i_block_group = (ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb);
ei->i_dir_start_lookup = 0;
/*
* NOTE! The in-memory inode i_data array is in little-endian order
* even on big-endian machines: we do NOT byteswap the block numbers!
*/
for (n = 0; n < EXT2_N_BLOCKS; n++)
ei->i_data[n] = raw_inode->i_block[n];
///下面是我们关心的。。。。。。。。。。。。。。。。。。。。。。。。
///这里对inode->fop赋值,就是inode中的file_operations结构。
if (S_ISREG(inode->i_mode)) { ///普通文件(S_ISREG),inode->i_fop为ext2_file_operations函数集
inode->i_op = &ext2_file_inode_operations;
if (ext2_use_xip(inode->i_sb)) { ///???现在不关心
inode->i_mapping->a_ops = &ext2_aops_xip;
inode->i_fop = &ext2_xip_file_operations;
} else if (test_opt(inode->i_sb, NOBH)) {
inode->i_mapping->a_ops = &ext2_nobh_aops;
inode->i_fop = &ext2_file_operations;
} else {
inode->i_mapping->a_ops = &ext2_aops;
inode->i_fop = &ext2_file_operations;
}
} else if (S_ISDIR(inode->i_mode)) { ///目录文件(S_ISDIR),inode->i_fop为ext2_dir_operations函数集
inode->i_op = &ext2_dir_inode_operations;
inode->i_fop = &ext2_dir_operations;
if (test_opt(inode->i_sb, NOBH))
inode->i_mapping->a_ops = &ext2_nobh_aops;
else
inode->i_mapping->a_ops = &ext2_aops;
} else if (S_ISLNK(inode->i_mode)) { ///链接文件(S_ISLNK),不需要inode->i_fop函数集
if (ext2_inode_is_fast_symlink(inode))
inode->i_op = &ext2_fast_symlink_inode_operations;
else {
inode->i_op = &ext2_symlink_inode_operations;
if (test_opt(inode->i_sb, NOBH))
inode->i_mapping->a_ops = &ext2_nobh_aops;
else
inode->i_mapping->a_ops = &ext2_aops;
}
} else {
inode->i_op = &ext2_special_inode_operations;
if (raw_inode->i_block[0])
init_special_inode(inode, inode->i_mode,
old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
else
init_special_inode(inode, inode->i_mode,
new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
}
///以上。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。
brelse (bh);
ext2_set_inode_flags(inode);
unlock_new_inode(inode);
return inode;
bad_inode:
iget_failed(inode);
}
上面一段代码把inode中的file_operations赋值为ext2_file_operations。
打开文件用sys_open(),在fs/open.c文件中,函数调用流程如下:
sys_open() --> do_sys_open() --> do_filp_open() --> nameidata_to_filp() --> __dentry_open()
int flags, struct file *f,
int (*open)(struct inode *, struct file *))
{
struct inode *inode;
int error;
f
->f_flags = flags;f->f_mode = ((flags+1) & O_ACCMODE) | FMODE_LSEEK |
FMODE_PREAD | FMODE_PWRITE;
inode = dentry->d_inode;
if (f->f_mode & FMODE_WRITE) {
error = __get_file_write_access(inode, mnt);
if (error)
goto cleanup_file;
if (!special_file(inode->i_mode))
file_take_write(f);
}
f
->f_mapping = inode->i_mapping;f->f_path.dentry = dentry;
f->f_path.mnt = mnt;
f->f_pos = 0;
f->f_op = fops_get(inode->i_fop); ///把inode中file_operations函数集给file中file_operations函数集
file_move(f, &inode->i_sb->s_files);
error
= security_dentry_open(f);if (error)
goto cleanup_all;
if (!open &am