本文共 12432 字,大约阅读时间需要 41 分钟。
http://www.cppblog.com/momoxiao/archive/2010/04/04/111594.html
先通过strace来看下ls命令的执行都做了哪些系统调用(例如执行 strace -o ls.txt ls,把跟踪结果写入ls.txt):
运行结果,这儿只摘取了ls.txt中我们感兴趣的部分:
open(".", O_RDONLY|O_NONBLOCK|O_LARGEFILE|O_DIRECTORY|O_CLOEXEC) = 3 /// 打开当前目录这个文件(目录是一种特殊的文件),并返回文件句柄3
fstat64(3, {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0 /// 取得当前目录文件的属性,比如这里大小为4096
fcntl64(3, F_GETFD) = 0x1 (flags FD_CLOEXEC)
getdents64(3, /* 33 entries */, 4096) = 1104 /// 读取当前目录下的文件
getdents64(3, /* 0 entries */, 4096) = 0
close(3) = 0 /// 关闭当前目录文件的句柄
这里核心是getdents64系统调用,它读取目录文件中的一个个目录项(directory entry)并返回,所以我们运行ls后才看到文件。
下面我们就看下getdents64是怎么用的,想办法干扰它的执行,从而隐藏掉我们不想让用户发现的文件。
fs/readdir.c
asmlinkage long sys_getdents64(unsigned int fd, struct linux_dirent64 __user * dirent, unsigned int count) { struct file * file; struct linux_dirent64 __user * lastdirent; struct getdents_callback64 buf; int error; error = - EFAULT; if ( ! access_ok(VERIFY_WRITE, dirent, count)) goto out ; error = - EBADF; file = fget(fd); if ( ! file) goto out ; buf.current_dir = dirent; buf.previous = NULL; buf.count = count; buf.error = 0 ; error = vfs_readdir(file, filldir64, & buf); /// 读取目录函数 if (error < 0 ) goto out_putf; error = buf.error; lastdirent = buf.previous; if (lastdirent) { typeof (lastdirent -> d_off) d_off = file -> f_pos; error = - EFAULT; if (__put_user(d_off, & lastdirent -> d_off)) goto out_putf; error = count - buf.count; } out_putf: fput(file); out : return error; } 首先,在sys_getdents64中通过调用vfs_readdir()读取目录函数。
那么什么是vfs呢?vfs全名Virtual File System,即虚拟文件系统,也常被称为Virtual Filesystem Switch(虚拟文件系统切换层)。我们可以把Linux的文件系统看成三层,最上层是上层用户使用的系统调用,中间一层就是vfs,最下面一层是挂载到VFS中的各种实际文件系统,比如ext2,jffs等。Switch这个词在这儿用得很形象,上层同一个系统调用,在vfs这层会根据文件系统的类型,调用对应的内核函数。vfs这层,本身就是起一个switch的作用。
看下vfs_readdir()吧。
fs/readdir.c int vfs_readdir( struct file * file, filldir_t filler, void * buf) { struct inode * inode = file -> f_path.dentry -> d_inode; int res = - ENOTDIR; if ( ! file -> f_op || ! file -> f_op -> readdir) goto out ; res = security_file_permission(file, MAY_READ); if (res) goto out ; res = mutex_lock_killable( & inode -> i_mutex); if (res) goto out ; res = - ENOENT; if ( ! IS_DEADDIR(inode)) { res = file -> f_op -> readdir(file, buf, filler); /// 调用实际文件系统的读取目录项(就是文件系统三层结构中最下面一层) file_accessed(file); } mutex_unlock( & inode -> i_mutex); out : return res; } 里面file->f_op->readdir()读取底层实际文件系统的目录项。
大致的关系是这样的:
file结构里有个文件操作的函数集const struct file_operations *f_op。
struct file_operations 中实际上是一些函数的指针,readdir就是其中的一个指针。
在调用vfs_readdir之前,内核会根据实际文件系统类型给struct file_operations赋对应值。
下面我们通过看代码,获得一个比较直观的认识。
struct file 和 struct file_operations都在include/linux/fs.h中定义。
file结构:
struct file { /* * fu_list becomes invalid after file_free is called and queued via * fu_rcuhead for RCU freeing */ union { struct list_head fu_list; struct rcu_head fu_rcuhead; } f_u; struct path f_path; #define f_dentry f_path.dentry #define f_vfsmnt f_path.mnt const struct file_operations * f_op; /// 对应每一种实际的文件系统,会有自己的file_operations函数集。可以理解成file这个类的纯虚函数集 atomic_long_t f_count; unsigned int f_flags; mode_t f_mode; loff_t f_pos; struct fown_struct f_owner; unsigned int f_uid, f_gid; struct file_ra_state f_ra; u64 f_version; #ifdef CONFIG_SECURITY void * f_security; #endif /* needed for tty driver, and maybe others */ void * private_data; #ifdef CONFIG_EPOLL /* Used by fs/eventpoll.c to link all the hooks to this file */ struct list_head f_ep_links; spinlock_t f_ep_lock; #endif /* #ifdef CONFIG_EPOLL */ struct address_space * f_mapping; #ifdef CONFIG_DEBUG_WRITECOUNT unsigned long f_mnt_write_state; #endif }; file_operations结构,里面是一些函数指针。我们在这儿关心的是int (*readdir) (struct file *, void *, filldir_t);
readdir()用来读取实际文件系统目录项。
struct file_operations { struct module * owner; loff_t ( * llseek) ( struct file * , loff_t, int ); ssize_t ( * read) ( struct file * , char __user * , size_t, loff_t * ); ssize_t ( * write) ( struct file * , const char __user * , size_t, loff_t * ); ssize_t ( * aio_read) ( struct kiocb * , const struct iovec * , unsigned long , loff_t); ssize_t ( * aio_write) ( struct kiocb * , const struct iovec * , unsigned long , loff_t); int ( * readdir) ( struct file * , void * , filldir_t); /// 我们在这儿关心的函数指针,实际文件系统的读取目录项函数。 /// 每次打开文件,内核都会根据文件位于的文件系统类型,对文件相应的file_operations赋相应值。 unsigned int ( * poll) ( struct file * , struct poll_table_struct * ); int ( * ioctl) ( struct inode * , struct file * , unsigned int , unsigned long ); long ( * unlocked_ioctl) ( struct file * , unsigned int , unsigned long ); long ( * compat_ioctl) ( struct file * , unsigned int , unsigned long ); int ( * mmap) ( struct file * , struct vm_area_struct * ); int ( * open) ( struct inode * , struct file * ); int ( * flush) ( struct file * , fl_owner_t id); int ( * release) ( struct inode * , struct file * ); int ( * fsync) ( struct file * , struct dentry * , int datasync); int ( * aio_fsync) ( struct kiocb * , int datasync); int ( * fasync) ( int , struct file * , int ); int ( * lock ) ( struct file * , int , struct file_lock * ); ssize_t ( * sendpage) ( struct file * , struct page * , int , size_t, loff_t * , int ); unsigned long ( * get_unmapped_area)( struct file * , unsigned long , unsigned long , unsigned long , unsigned long ); int ( * check_flags)( int ); int ( * dir_notify)( struct file * filp, unsigned long arg); int ( * flock) ( struct file * , int , struct file_lock * ); ssize_t ( * splice_write)( struct pipe_inode_info * , struct file * , loff_t * , size_t, unsigned int ); ssize_t ( * splice_read)( struct file * , loff_t * , struct pipe_inode_info * , size_t, unsigned int ); int ( * setlease)( struct file * , long , struct file_lock ** ); }; 下面来看下在ls用到file结构中的file_operations之前,内核是怎样它赋值的
struct inode * ext2_iget ( struct super_block * sb, unsigned long ino) { struct ext2_inode_info * ei; struct buffer_head * bh; struct ext2_inode * raw_inode; struct inode * inode; long ret = - EIO; int n; inode = iget_locked(sb, ino); if ( ! inode) return ERR_PTR( - ENOMEM); if ( ! (inode -> i_state & I_NEW)) return inode; ei = EXT2_I(inode); #ifdef CONFIG_EXT2_FS_POSIX_ACL ei -> i_acl = EXT2_ACL_NOT_CACHED; ei -> i_default_acl = EXT2_ACL_NOT_CACHED; #endif ei -> i_block_alloc_info = NULL; raw_inode = ext2_get_inode(inode -> i_sb, ino, & bh); if (IS_ERR(raw_inode)) { ret = PTR_ERR(raw_inode); goto bad_inode; } inode -> i_mode = le16_to_cpu(raw_inode -> i_mode); inode -> i_uid = (uid_t)le16_to_cpu(raw_inode -> i_uid_low); inode -> i_gid = (gid_t)le16_to_cpu(raw_inode -> i_gid_low); if ( ! (test_opt (inode -> i_sb, NO_UID32))) { inode -> i_uid |= le16_to_cpu(raw_inode -> i_uid_high) << 16 ; inode -> i_gid |= le16_to_cpu(raw_inode -> i_gid_high) << 16 ; } inode -> i_nlink = le16_to_cpu(raw_inode -> i_links_count); inode -> i_size = le32_to_cpu(raw_inode -> i_size); inode -> i_atime.tv_sec = (signed)le32_to_cpu(raw_inode -> i_atime); inode -> i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode -> i_ctime); inode -> i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode -> i_mtime); inode -> i_atime.tv_nsec = inode -> i_mtime.tv_nsec = inode -> i_ctime.tv_nsec = 0 ; ei -> i_dtime = le32_to_cpu(raw_inode -> i_dtime); /* We now have enough fields to check if the inode was active or not. 
* This is needed because nfsd might try to access dead inodes * the test is that same one that e2fsck uses * NeilBrown 1999oct15 */ if (inode -> i_nlink == 0 && (inode -> i_mode == 0 || ei -> i_dtime)) { /* this inode is deleted */ brelse (bh); ret = - ESTALE; goto bad_inode; } inode -> i_blocks = le32_to_cpu(raw_inode -> i_blocks); ei -> i_flags = le32_to_cpu(raw_inode -> i_flags); ei -> i_faddr = le32_to_cpu(raw_inode -> i_faddr); ei -> i_frag_no = raw_inode -> i_frag; ei -> i_frag_size = raw_inode -> i_fsize; ei -> i_file_acl = le32_to_cpu(raw_inode -> i_file_acl); ei -> i_dir_acl = 0 ; if (S_ISREG(inode -> i_mode)) inode -> i_size |= ((__u64)le32_to_cpu(raw_inode -> i_size_high)) << 32 ; else ei -> i_dir_acl = le32_to_cpu(raw_inode -> i_dir_acl); ei -> i_dtime = 0 ; inode -> i_generation = le32_to_cpu(raw_inode -> i_generation); ei -> i_state = 0 ; ei -> i_block_group = (ino - 1 ) / EXT2_INODES_PER_GROUP(inode -> i_sb); ei -> i_dir_start_lookup = 0 ; /* * NOTE! The in-memory inode i_data array is in little-endian order * even on big-endian machines: we do NOT byteswap the block numbers! 
*/ for (n = 0 ; n < EXT2_N_BLOCKS; n ++ ) ei -> i_data[n] = raw_inode -> i_block[n]; /// 下面是我们关心的。。。。。。。。。。。。。。。。。。。。。。。。 /// 这里对inode->fop赋值,就是inode中的file_operations结构。 if (S_ISREG(inode -> i_mode)) { /// 普通文件(S_ISREG),inode->i_fop为ext2_file_operations函数集 inode -> i_op = & ext2_file_inode_operations; if (ext2_use_xip(inode -> i_sb)) { /// ???现在不关心 inode -> i_mapping -> a_ops = & ext2_aops_xip; inode -> i_fop = & ext2_xip_file_operations; } else if (test_opt(inode -> i_sb, NOBH)) { inode -> i_mapping -> a_ops = & ext2_nobh_aops; inode -> i_fop = & ext2_file_operations; } else { inode -> i_mapping -> a_ops = & ext2_aops; inode -> i_fop = & ext2_file_operations; } } else if (S_ISDIR(inode -> i_mode)) { /// 目录文件(S_ISDIR),inode->i_fop为ext2_dir_operations函数集 inode -> i_op = & ext2_dir_inode_operations; inode -> i_fop = & ext2_dir_operations; if (test_opt(inode -> i_sb, NOBH)) inode -> i_mapping -> a_ops = & ext2_nobh_aops; else inode -> i_mapping -> a_ops = & ext2_aops; } else if (S_ISLNK(inode -> i_mode)) { /// 链接文件(S_ISLNK),不需要inode->i_fop函数集 if (ext2_inode_is_fast_symlink(inode)) inode -> i_op = & ext2_fast_symlink_inode_operations; else { inode -> i_op = & ext2_symlink_inode_operations; if (test_opt(inode -> i_sb, NOBH)) inode -> i_mapping -> a_ops = & ext2_nobh_aops; else inode -> i_mapping -> a_ops = & ext2_aops; } } else { inode -> i_op = & ext2_special_inode_operations; if (raw_inode -> i_block[ 0 ]) init_special_inode(inode, inode -> i_mode, old_decode_dev(le32_to_cpu(raw_inode -> i_block[ 0 ]))); else init_special_inode(inode, inode -> i_mode, new_decode_dev(le32_to_cpu(raw_inode -> i_block[ 1 ]))); } /// 以上。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。 brelse (bh); ext2_set_inode_flags(inode); unlock_new_inode(inode); return inode; bad_inode: iget_failed(inode); return ERR_PTR(ret); } 上面一段代码把inode中的file_operations赋值为ext2_file_operations。
打开文件用sys_open(),在fs/open.c文件中,函数调用流程如下:
sys_open() --> do_sys_open() --> do_filp_open() --> nameidata_to_filp() --> __dentry_open()
static struct file * __dentry_open( struct dentry * dentry, struct vfsmount * mnt, int flags, struct file * f, int ( * open)( struct inode * , struct file * )) { struct inode * inode; int error; f -> f_flags = flags; f -> f_mode = ((flags + 1 ) & O_ACCMODE) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; inode = dentry -> d_inode; if (f -> f_mode & FMODE_WRITE) { error = __get_file_write_access(inode, mnt); if (error) goto cleanup_file; if ( ! special_file(inode -> i_mode)) file_take_write(f); } f -> f_mapping = inode -> i_mapping; f -> f_path.dentry = dentry; f -> f_path.mnt = mnt; f -> f_pos = 0 ; f -> f_op = fops_get(inode -> i_fop); /// 把inode中file_operations函数集给file中file_operations函数集 file_move(f, & inode -> i_sb -> s_files); error = security_dentry_open(f); if (error) goto cleanup_all; if ( ! open && f -> f_op) open = f -> f_op -> open; if (open) { error = open(inode, f); if (error) goto cleanup_all; } f -> f_flags &= ~ (O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); file_ra_state_init( & f -> f_ra, f -> f_mapping -> host -> i_mapping); /* NB: we're sure to have correct a_ops only after f_op->open */ if (f -> f_flags & O_DIRECT) { if ( ! f -> f_mapping -> a_ops || (( ! f -> f_mapping -> a_ops -> direct_IO) && ( ! f -> f_mapping -> a_ops -> get_xip_mem))) { fput(f); f = ERR_PTR( - EINVAL); } } return f; cleanup_all: fops_put(f -> f_op); if (f -> f_mode & FMODE_WRITE) { put_write_access(inode); if ( ! special_file(inode -> i_mode)) { /* * We don't consider this a real * mnt_want/drop_write() pair * because it all happenend right * here, so just reset the state. */ file_reset_write(f); mnt_drop_write(mnt); } } file_kill(f); f -> f_path.dentry = NULL; f -> f_path.mnt = NULL; cleanup_file: put_filp(f); dput(dentry); mntput(mnt); return ERR_PTR(error); } 在这儿,f
-> f_op = fops_get(inode -> i_fop); 把file结构中的file_operations函数集赋值成inode中的函数集。对目录而言,ext2_iget()在S_ISDIR分支里赋的是ext2_dir_operations(ext2_file_operations只用于普通文件,其中并没有readdir)。 下面归纳下ls执行的整个流程:
假设当前目录在ext2文件系统上,ls要查看当前目录下的文件,
1.open打开当前目录的句柄,这个句柄对应内核中一个file结构。
file结构中的file_operations函数集从inode结构中获得,由于打开的是目录,所以是ext2_dir_operations
2.getdents64通过vfs_readdir()调用file->f_op->readdir(),实际上是调用了ext2_dir_operations中的readdir(),
由ext2文件系统驱动读取当前目录下面的文件项。
我们要隐藏一个文件,要做的就是替换file->f_op->readdir(),也就是替换ext2_dir_operations中的readdir()。
转载地址:http://orlmb.baihongyu.com/