管道文件系统pipefs

概述

在Linux的世界里，万物皆文件：内核通过虚拟文件系统VFS统一管理并调用各种不同的文件系统，因此各类对象都可以通过文件IO系统调用来操作。管道也不例外，它是借助一个伪文件系统pipefs来实现的。同其他真正的文件系统（ext3、ext4等）一样，pipefs也实现了VFS中的四种主要对象：super_block、inode、dentry和文件对象file。当对管道进行读写操作时，VFS会将请求转发给pipefs，再由pipefs调用自己特定的操作函数。

file_system_type操作表

pipefs既然是一个文件系统，就会有一个被称为file_system_type的数据结构，用于在系统启动或者文件系统模块加载时向VFS进行注册。所有已注册文件系统的file_system_type结构形成一个链表，链表头由file_systems变量指定。pipefs的file_system_type操作表如下：

/*
 * Filesystem type descriptor for pipefs: names the filesystem "pipefs",
 * uses pipefs_mount as its mount hook and kill_anon_super to tear down
 * its super block.
 */
static struct file_system_type pipe_fs_type = {
 .name = "pipefs",
 .mount = pipefs_mount,
 .kill_sb = kill_anon_super, // used to remove the super block of a special filesystem
};

3.18内核版本的file_system_type结构体中有一个名为mount的钩子函数成员（该成员自2.6.37起引入，取代了更早版本中的get_sb）：

/*
 * Mount hook member of struct file_system_type. It returns a
 * struct dentry pointer; pipefs_mount is the pipefs implementation of it.
 */
struct dentry *(*mount) (struct file_system_type *, int, const char *, void *);

pipefs、sockfs和bdev等伪文件系统对该钩子函数的实现都是对mount_pseudo函数的封装。mount_pseudo主要根据file_system_type创建一个super_block并进行一系列初始化工作，为其创建根inode，然后根据该超级块和调用者传入的名称（pipefs传入的是"pipe:"）分配一个目录项，将其设置为该超级块根目录的目录项对象，最后返回该目录项：

/*
 * pipefs should _never_ be mounted by userland - too much of security hassle,
 * no real gain from having the whole whorehouse mounted. So we don't need
 * any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 *
 * Mount hook for pipefs. Delegates to mount_pseudo() with "pipe:" as the
 * root dentry name, pipefs_ops as the super operations,
 * pipefs_dentry_operations as the dentry operations and PIPEFS_MAGIC as
 * the magic number. The flags, dev_name and data arguments are not used.
 * Returns whatever mount_pseudo() returns.
 */
static struct dentry *pipefs_mount(struct file_system_type *fs_type,
 int flags, const char *dev_name, void *data)
{
 return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
 &pipefs_dentry_operations, PIPEFS_MAGIC);
}

mount_pseudo函数定义如下:

/*
 * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
 * will never be mountable)
 *
 * Obtains a super block through sget(), fills in its basic fields, creates
 * the root inode (inode number 1) and a root dentry named @name, and
 * returns that dentry after taking a reference on it with dget().
 *
 * @fs_type: filesystem type passed to sget()
 * @name:    name given to the root dentry
 * @ops:     super operations; &simple_super_operations is used when NULL
 * @dops:    dentry operations, stored in s->s_d_op
 * @magic:   magic number, stored in s->s_magic
 *
 * Returns the root dentry on success, the sget() error cast to a dentry
 * pointer if sget() fails, or ERR_PTR(-ENOMEM) if the root inode or the
 * root dentry cannot be allocated.
 */
struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
 const struct super_operations *ops,
 const struct dentry_operations *dops, unsigned long magic)
{
 struct super_block *s;
 struct dentry *dentry;
 struct inode *root;
 struct qstr d_name = QSTR_INIT(name, strlen(name));
 
 s = sget(fs_type, NULL, set_anon_super, MS_NOUSER, NULL); // set_anon_super is the callback that initializes the super block of a special filesystem
 if (IS_ERR(s))
 return ERR_CAST(s);
 
 s->s_maxbytes = MAX_LFS_FILESIZE;
 s->s_blocksize = PAGE_SIZE;
 s->s_blocksize_bits = PAGE_SHIFT;
 s->s_magic = magic;
 s->s_op = ops ? ops : &simple_super_operations;
 s->s_time_gran = 1;
 root = new_inode(s);
 if (!root)
 goto Enomem;
 /*
 * since this is the first inode, make it number 1. New inodes created
 * after this must take care not to collide with it (by passing
 * max_reserved of 1 to iunique).
 */
 root->i_ino = 1;
 root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
 root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
 // __d_alloc allocates a dentry (dcache entry) for the root directory; it returns NULL when memory is insufficient and a struct dentry pointer on success
 dentry = __d_alloc(s, &d_name);
 if (!dentry) {
 iput(root);
 goto Enomem;
 }
 d_instantiate(dentry, root);
 s->s_root = dentry; // install the root dentry of the filesystem, then its dentry operations table
 s->s_d_op = dops;
 s->s_flags |= MS_ACTIVE;
 return dget(s->s_root); // take a reference on the root dentry and return it
 
Enomem:
 // An allocation failed after sget() succeeded: hand the super block to deactivate_locked_super() and report -ENOMEM
 deactivate_locked_super(s);
 return ERR_PTR(-ENOMEM);
}

管道文件系统比较简单，它只存在于内存中，因而其超级块操作表也比较简单，只定义了两个操作函数：free_inode_nonrcu用于释放inode对象；simple_statfs用于获取pipefs文件系统的状态信息。

/*
 * Super block operations for pipefs; only two hooks are provided.
 */
static const struct super_operations pipefs_ops = {
 .destroy_inode = free_inode_nonrcu, // used to release an inode object
 .statfs = simple_statfs, // used to report pipefs filesystem status information
};

pipefs初始化

接下来看看pipefs文件系统的初始化，主要是进行pipefs的注册，并进行pipefs伪文件系统的装载（没有挂载点）。

/*
 * Initialise pipefs: register pipe_fs_type, then mount it with
 * kern_mount() and keep the result in pipe_mnt. If the mount fails, the
 * filesystem is unregistered again and the mount error is returned.
 *
 * Returns 0 on success, otherwise the error from register_filesystem()
 * or the error encoded in the kern_mount() result.
 */
static int __init init_pipe_fs(void)
{
 int err = register_filesystem(&pipe_fs_type);
 
 if (!err) {
 pipe_mnt = kern_mount(&pipe_fs_type);
 if (IS_ERR(pipe_mnt)) {
 err = PTR_ERR(pipe_mnt);
 unregister_filesystem(&pipe_fs_type);
 }
 }
 return err;
}
 
/* Hook init_pipe_fs into kernel start-up through the fs_initcall mechanism. */
fs_initcall(init_pipe_fs);

以上就是pipefs伪文件系统的初始化与挂载过程。接下来看一下管道文件具体的IO操作过程。

/*
 * File operations table for pipes and FIFOs. The .read/.write slots use
 * new_sync_read/new_sync_write, while the pipe-specific pipe_read and
 * pipe_write are installed as the .read_iter/.write_iter hooks.
 */
const struct file_operations pipefifo_fops = {
 .open = fifo_open,
 .llseek = no_llseek,
 .read = new_sync_read,
 .read_iter = pipe_read,
 .write = new_sync_write,
 .write_iter = pipe_write,
 .poll = pipe_poll,
 .unlocked_ioctl = pipe_ioctl,
 .release = pipe_release,
 .fasync = pipe_fasync,
};

new_sync_read函数是一个通用的读函数，它内部会调用read_iter钩子函数，也就是说，管道的读操作最终还是由pipe_read函数完成；同理，管道的写操作最终由pipe_write完成。那么read和read_iter有什么区别呢？在老版本内核中，read每次调用通常只读取到一个缓冲区，若存在多个缓冲区就需要多次调用read函数。而新版本内核中添加的read_iter是一个聚合读函数，可以一次读取到多个缓冲区中，性能较好。

代码基于Linux 3.18.24