前言

     容器标准包 bundle 与配置

  1. config.json: 基本配置文件,包括与宿主机独立的和应用相关的特定信息,如安全权限、环境变量和参数等。具体如下:

  • 容器版本
  • rootfs 路径及权限(ro / rw)
  • 各类文件挂载点及相应容器内挂载目录(必须与 state.json 一致)
  • 初始进程配置信息,包括是否绑定终端、工作目录、环境变量配置、可执行文件参数、uid、gid 以及额外需要加入的 hostname 等
  2. state.json: 运行时配置文件(运行时主机相关的信息,如内存限制、设备访问权限、挂载点等)
  3. rootfs: 根文件系统目录,容器执行的环境依赖,如 /bin、/var、/lib、/dev、/usr 等目录及相应文件

    命令

         

    runc run -h
    NAME:
       runc run - create and run a container


    USAGE:
       runc run [command options] <container-id>


    Where "<container-id>" is your name for the instance of the container that you
    are starting. The name you provide for the container instance must be unique on
    your host.


    DESCRIPTION:
       The run command creates an instance of a container for a bundle. The bundle
    is a directory with a specification file named "config.json" and a root
    filesystem.


    The specification file includes an args parameter. The args parameter is used
    to specify command(s) that get run when the container is started. To change the
    command(s) that get executed on start, edit the args parameter of the spec. See
    "runc spec --help" for more explanation.


    OPTIONS:
       --bundle value, -b value  path to the root of the bundle directory, defaults to the current directory
       --console-socket value    path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal
       --detach, -d              detach from the container's process
       --pid-file value          specify the file to write the process id to
       --no-subreaper            disable the use of the subreaper used to reap reparented processes
       --no-pivot                do not use pivot root to jail process inside rootfs.  This should be used whenever the rootfs is on top of a ramdisk
       --no-new-keyring          do not create a new session keyring for the container.  This will cause the container to inherit the calling processes session key
       --preserve-fds value      Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total) (default: 0)


    一. runc create 命令分析

       1.1 命令定义位于 create.go,核心是 Action 函数中调用的 startContainer 函数(见 1.2)

    var createCommand = cli.Command{
           Name:  "create",
           。。。。。。
           },
           Action: func(context *cli.Context) error {
                  if err := checkArgs(context, 1, exactArgs); err != nil {
                         return err
                  }
                  if err := revisePidFile(context); err != nil {
                         return err
                  }
                  spec, err := setupSpec(context)
                  status, err := startContainer(context, spec, CT_ACT_CREATE, nil)
                  return nil
           },
    }

        1.2 startContainer 函数的参数 action 为 CT_ACT_CREATE,表示创建,后面会用到。函数先从命令行获得容器 id,再调用 createContainer(见 1.2.1);runner 结构体以及 run 函数内容较多,放在第三章讲解
    func startContainer(context *cli.Context, spec *specs.Spec, action CtAct, criuOpts *libcontainer.CriuOpts) (int, error) {
           id := context.Args().First()
        
           container, err := createContainer(context, id, spec)
            r := &runner{
                  enableSubreaper: !context.Bool("no-subreaper"),
                  shouldDestroy:   true,
                 。。。。。。
           }
           return r.run(spec.Process)
    }

        1.2.1 createContainer 函数中 CreateLibcontainerConfig 根据 spec 生成创建容器所需的配置,例如 bundle 路径、namespace、capabilities、标签等;loadFactory 见 1.2.1.1,factory.Create 函数见 1.2.1.2
    func createContainer(context *cli.Context, id string, spec *specs.Spec) (libcontainer.Container, error) {
           config, err := specconv.CreateLibcontainerConfig(&specconv.CreateOpts{
                  CgroupName:       id,
                  UseSystemdCgroup: context.GlobalBool("systemd-cgroup"),
                  NoPivotRoot:      context.Bool("no-pivot"),
                  NoNewKeyring:     context.Bool("no-new-keyring"),
                  Spec:             spec,
                  Rootless:         isRootless(),
           })
           if err != nil {
                  return nil, err
           }
    
           factory, err := loadFactory(context)
           if err != nil {
                  return nil, err
           }
           return factory.Create(id, config)
    

        1.2.1.1 loadFactory 为容器返回配置化的实例 factory
    // loadFactory returns the configured factory instance for execing containers.
    //
    // The state root is taken from the global "root" flag and made absolute.
    // The cgroup manager defaults to libcontainer.Cgroupfs; when the global
    // "systemd-cgroup" flag is set it becomes libcontainer.SystemdCgroups, or an
    // error is returned if systemd.UseSystemd() reports false. The criu binary
    // path comes from the global "criu" flag.
    func loadFactory(context *cli.Context) (libcontainer.Factory, error) {
           root := context.GlobalString("root")
           abs, err := filepath.Abs(root)
           if err != nil {
                  return nil, err
           }
           // Default to the cgroupfs-backed manager unless systemd was requested.
           cgroupManager := libcontainer.Cgroupfs
           if context.GlobalBool("systemd-cgroup") {
                  if systemd.UseSystemd() {
                         cgroupManager = libcontainer.SystemdCgroups
                  } else {
                         return nil, fmt.Errorf("systemd cgroup flag passed, but systemd support for managing cgroups is not available")
                  }
           }
           return libcontainer.New(abs, cgroupManager, libcontainer.CriuPath(context.GlobalString("criu")))
    }

        cgroupManager 默认指向 Cgroupfs 函数,该函数把 LinuxFactory 的 NewCgroupsManager 设置为一个返回 fs.Manager 的构造函数,fs.Manager 实现了 cgroups.Manager 接口
    // Cgroupfs is a LinuxFactory option that sets l.NewCgroupsManager to a
    // constructor returning an *fs.Manager built from the given cgroup config
    // and paths. It always returns nil.
    func Cgroupfs(l *LinuxFactory) error {
           l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
                  return &fs.Manager{
                         Cgroups: config,
                         Paths:   paths,
                  }
           }
           return nil
    }
     
        libcontainer.New 返回一个 linux 系统实现的结构体,根据传入的参数配置结构体中的 NewCgroupsManager 和 CriuPath
    // New returns a linux based container factory based in the root directory and
    // configures the factory with the provided option funcs.
    //
    // Defaults: InitArgs re-executes /proc/self/exe with "init", CriuPath is
    // "criu", and the cgroup manager is the cgroupfs one. Options are applied in
    // order after the defaults; the first option error aborts construction.
    func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
           // Create the root directory (mode 0700) only when a root was given.
           if root != "" {
                  if err := os.MkdirAll(root, 0700); err != nil {
                         return nil, newGenericError(err, SystemError)
                  }
           }
           l := &LinuxFactory{
                  Root:      root,
                  InitArgs:  []string{"/proc/self/exe", "init"},
                  Validator: validate.New(),
                  CriuPath:  "criu",
           }
           // Install the default cgroup manager before the caller's options run.
           Cgroupfs(l)
           for _, opt := range options {
                  if err := opt(l); err != nil {
                         return nil, err
                  }
           }
           return l, nil
    }

       1.2.1.2 Create 做的事情比较简单,对配置进行一些验证工作,验证成功创建容器的根路径并设置权限
    // Create validates the factory root, the container id and the config, then
    // creates the container's state directory under l.Root and returns a
    // linuxContainer whose state is stopped.
    //
    // It returns a ConfigInvalid error for an empty root or a config rejected by
    // the validator, IdInUse when the directory for id already exists, and
    // SystemError when a filesystem operation fails.
    func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
           if l.Root == "" {
                  return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
           }
           if err := l.validateID(id); err != nil {
                  return nil, err
           }
           if err := l.Validator.Validate(config); err != nil {
                  return nil, newGenericError(err, ConfigInvalid)
           }
           containerRoot := filepath.Join(l.Root, id)
           // The container directory must not exist yet; any Stat error other
           // than "not exist" is reported as a system error.
           if _, err := os.Stat(containerRoot); err == nil {
                  return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse)
           } else if !os.IsNotExist(err) {
                  return nil, newGenericError(err, SystemError)
           }
           if err := os.MkdirAll(containerRoot, 0711); err != nil {
                  return nil, newGenericError(err, SystemError)
           }
           // Hand the directory to the effective uid/gid of this process.
           if err := os.Chown(containerRoot, unix.Geteuid(), unix.Getegid()); err != nil {
                  return nil, newGenericError(err, SystemError)
           }
           // NOTE(review): presumably switches the factory to a rootless cgroup
           // manager; RootlessCgroups is not shown here - confirm.
           if config.Rootless {
                  RootlessCgroups(l)
           }
           c := &linuxContainer{
                  id:            id,
                  root:          containerRoot,
                  config:        config,
                  initArgs:      l.InitArgs,
                  criuPath:      l.CriuPath,
                  cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
           }
           // A freshly created container starts out in the stopped state.
           c.state = &stoppedState{c: c}
           return c, nil
    }

        成功会返回 linuxContainer 结构体,包括容器 ID,根路径,参数等,将状态置为 stopped
    type linuxContainer struct {
           id                   string
           root                 string
           config               *configs.Config
           cgroupManager        cgroups.Manager
           initArgs             []string
           initProcess          parentProcess
           initProcessStartTime string
           criuPath             string
           m                    sync.Mutex
           criuVersion          int
           state                containerState
           created              time.Time
    }

        

    二. Factory 分析


        2.1 Factory对象为容器创建和初始化工作提供了一组抽象接口
    type Factory interface {
           Create(id string, config *configs.Config) (Container, error)
    
           Load(id string) (Container, error)
           StartInitialization() error
           Type() string
    }
    • Create: 根据 id 和配置参数创建容器,返回 Container 对象(此时容器处于 stopped 状态,尚未启动任何进程)。
    • Load:  从 id 对应的容器目录下读取 state.json 来载入容器

        2.2 Linux 系统 factory 对象的结构体,实现了 Factory 接口
    type LinuxFactory struct {
           // Root directory for the factory to store state.
           Root string
    
           // InitArgs are arguments for calling the init responsibilities for spawning
           // a container.
           InitArgs []string
    
           // CriuPath is the path to the criu binary used for checkpoint and restore of
           // containers.
           CriuPath string
    
           // Validator provides validation to container configurations.
           Validator validate.Validator
    
           // NewCgroupsManager returns an initialized cgroups manager for a single container.
           NewCgroupsManager func(config *configs.Cgroup, paths map[string]string) cgroups.Manager
    }


    三. runner 分析


         runner 结构体,运行容器的信息 
    // runner carries the settings used to run a process in a container.
    //
    // As used by run: enableSubreaper and notifySocket are passed to
    // newSignalHandler, consoleSocket to setupIO, listenFDs and preserveFDs
    // extend the process's ExtraFiles, and action selects whether the container
    // is started (CT_ACT_CREATE), restored with criuOpts (CT_ACT_RESTORE) or
    // run (CT_ACT_RUN).
    type runner struct {
           enableSubreaper bool
           shouldDestroy   bool
           detach          bool
           listenFDs       []*os.File
           preserveFDs     int
           pidFile         string
           consoleSocket   string
           container       libcontainer.Container
           action          CtAct
           notifySocket    *notifySocket
           criuOpts        *libcontainer.CriuOpts
    }
        
        Process 结构体,启动容器内进程的信息
    type Process struct {
           // Terminal creates an interactive terminal for the container.
           Terminal bool `json:"terminal,omitempty"`
           // ConsoleSize specifies the size of the console.
           ConsoleSize *Box `json:"consoleSize,omitempty"`
           // User specifies user information for the process.
           User User `json:"user"`
           // Args specifies the binary and arguments for the application to execute.
           Args []string `json:"args"`
           // Env populates the process environment for the process.
           Env []string `json:"env,omitempty"`
           // Cwd is the current working directory for the process and must be
           // relative to the container's root.
           Cwd string `json:"cwd"`
           // Capabilities are Linux capabilities that are kept for the process.
           Capabilities *LinuxCapabilities `json:"capabilities,omitempty" platform:"linux"`
           // Rlimits specifies rlimit options to apply to the process.
           Rlimits []POSIXRlimit `json:"rlimits,omitempty" platform:"linux,solaris"`
           // NoNewPrivileges controls whether additional privileges could be gained by processes in the container.
           NoNewPrivileges bool `json:"noNewPrivileges,omitempty" platform:"linux"`
           // ApparmorProfile specifies the apparmor profile for the container.
           ApparmorProfile string `json:"apparmorProfile,omitempty" platform:"linux"`
           // Specify an oom_score_adj for the container.
           OOMScoreAdj *int `json:"oomScoreAdj,omitempty" platform:"linux"`
           // SelinuxLabel specifies the selinux context that the container process is run as.
           SelinuxLabel string `json:"selinuxLabel,omitempty" platform:"linux"`
    }

        3.1 run 函数
    func (r *runner) run(config *specs.Process) (int, error)

        3.1.1 newProcess 主要是填充 libcontainer.Process 结构体,包括参数,环境变量,user 权限,工作目录,capabilities,资源限制等
    // newProcess converts the spec's process description p into a
    // *libcontainer.Process: args, environment, "uid:gid" user string, working
    // directory, SELinux label, no-new-privileges flag, AppArmor profile,
    // capabilities, additional groups and rlimits.
    //
    // It fails only when an rlimit cannot be converted by
    // createLibContainerRlimit.
    func newProcess(p specs.Process) (*libcontainer.Process, error) {
           lp := &libcontainer.Process{
                  Args: p.Args,
                  Env:  p.Env,
                  // TODO: fix libcontainer's API to better support uid/gid in a typesafe way.
                  User:            fmt.Sprintf("%d:%d", p.User.UID, p.User.GID),
                  Cwd:             p.Cwd,
                  Label:           p.SelinuxLabel,
                  NoNewPrivileges: &p.NoNewPrivileges,
                  AppArmorProfile: p.ApparmorProfile,
           }
           // Capabilities stay nil on lp when the spec does not set them.
           if p.Capabilities != nil {
                  lp.Capabilities = &configs.Capabilities{}
                  lp.Capabilities.Bounding = p.Capabilities.Bounding
                  lp.Capabilities.Effective = p.Capabilities.Effective
                  lp.Capabilities.Inheritable = p.Capabilities.Inheritable
                  lp.Capabilities.Permitted = p.Capabilities.Permitted
                  lp.Capabilities.Ambient = p.Capabilities.Ambient
           }
           // Additional group ids are carried as decimal strings.
           for _, gid := range p.User.AdditionalGids {
                  lp.AdditionalGroups = append(lp.AdditionalGroups, strconv.FormatUint(uint64(gid), 10))
           }
           for _, rlimit := range p.Rlimits {
                  rl, err := createLibContainerRlimit(rlimit)
                  if err != nil {
                         return nil, err
                  }
                  lp.Rlimits = append(lp.Rlimits, rl)
           }
           return lp, nil
    }

        3.1.2 listen fd 加入 process 的环境变量和需要在新进程保持打开的文件列表中(ExtraFiles)
    if len(r.listenFDs) > 0 {
           process.Env = append(process.Env, fmt.Sprintf("LISTEN_FDS=%d", len(r.listenFDs)), "LISTEN_PID=1")
           process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...)
    }
    baseFd := 3 + len(process.ExtraFiles)
    for i := baseFd; i < baseFd+r.preserveFDs; i++ {
           process.ExtraFiles = append(process.ExtraFiles, os.NewFile(uintptr(i), "PreserveFD:"+strconv.Itoa(i)))
    }

        3.1.3 创建 signalHandler 处理 tty 和 signal,setupIO 来进行 io 和 tty 相关配置,对于 create 就是 dup 将当前进程的 io,chown 用户/组权限
    // Setting up IO is a two stage process. We need to modify process to deal
    // with detaching containers, and then we get a tty after the container has
    // started.
    handler := newSignalHandler(r.enableSubreaper, r.notifySocket)
    tty, err := setupIO(process, rootuid, rootgid, config.Terminal, detach, r.consoleSocket)
    if err != nil {
           r.destroy()
           return -1, err
    }

        3.1.4 根据 action 为 create,第四章继续分析 r.container.Start 函数
    switch r.action {
    case CT_ACT_CREATE:
           err = r.container.Start(process)
    case CT_ACT_RESTORE:
           err = r.container.Restore(process, r.criuOpts)
    case CT_ACT_RUN:
           err = r.container.Run(process)
    default:
           panic("Unknown action")
    }


    四. container 分析


        4.0 结构体及接口
        Container对象主要包含了容器配置、控制、状态显示等功能,每一个 Container 进程内部都是线程安全的。由于 Container 可能被其他进程销毁,所以每个方法都会对容器是否存在进行检测。
        Container 接口:
    • ID():返回容器的 ID
    • Status(): 返回容器的当前状态
    • State(): 返回运行容器状态信息,包括容器ID,初始进程ID,初始进程启动时间,配置信息,cgroup 路径,namespace 路径
    • Config(): 返回当前容器的配置
    • Processes(): 返回容器内 PIDs
    • Stats(): 返回容器的统计信息
    • Start(): 在容器内启动一个进程
    type Container interface {
           BaseContainer
    
           Checkpoint(criuOpts *CriuOpts) error
           Restore(process *Process, criuOpts *CriuOpts) error
    
           Pause() error
           Resume() error
    
           NotifyOOM() (<-chan struct{}, error)
           NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
    }
    type BaseContainer interface {
           ID() string
    
           Status() (Status, error)
           State() (*State, error)
           Config() configs.Config
           Processes() ([]int, error)
           Stats() (*Stats, error)
           
           Set(config configs.Config) error
           Start(process *Process) (err error)
           Run(process *Process) (err error)
           Destroy() error
           Signal(s os.Signal, all bool) error
           Exec() error
    }

        State 结构体,表示一个运行中的容器状态信息:
    // State represents a running container's state
    type State struct {
           BaseState
    
           // Platform specific fields below here
    
           // Specifies if the container was started under the rootless mode.
           Rootless bool `json:"rootless"`
    
           // Path to all the cgroups setup for a container. Key is cgroup subsystem name
           // with the value as the path.
           CgroupPaths map[string]string `json:"cgroup_paths"`
    
           // NamespacePaths are filepaths to the container's namespaces. Key is the namespace type
           // with the value as the path.
           NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"`
    
           // Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore
           ExternalDescriptors []string `json:"external_descriptors,omitempty"`
    }
    type BaseState struct {
           // ID is the container ID.
           ID string `json:"id"`
    
           // InitProcessPid is the init process id in the parent namespace.
           InitProcessPid int `json:"init_process_pid"`
    
           // InitProcessStartTime is the init process start time in clock cycles since boot time.
           InitProcessStartTime uint64 `json:"init_process_start"`
    
           // Created is the unix timestamp for the creation time of the container in UTC
           Created time.Time `json:"created"`
    
           // Config is the container's configuration.
           Config configs.Config `json:"config"`
    }

        4.1 Start: 按 create 流程走到这里时容器 status 为 stopped,因此先创建 exec fifo,再以 isInit 为 true 调用 start 函数(start 中会将状态设置为 created),4.2 主要分析 start 函数
    // Start starts process in the container while holding c.m.
    //
    // When the container's current status is Stopped, an exec fifo is created
    // first and start is called with its second argument set to true; if start
    // then fails, the fifo is deleted again before the error is returned.
    func (c *linuxContainer) Start(process *Process) error {
           c.m.Lock()
           defer c.m.Unlock()
           status, err := c.currentStatus()
           if err != nil {
                  return err
           }
           if status == Stopped {
                  if err := c.createExecFifo(); err != nil {
                         return err
                  }
           }
           if err := c.start(process, status == Stopped); err != nil {
                  // Undo the fifo creation done above for a stopped container.
                  if status == Stopped {
                         c.deleteExecFifo()
                  }
                  return err
           }
           return nil
    }

        4.2 start 函数分为如下讲解
    func (c *linuxContainer) start(process *Process, isInit bool) error

        4.2.1 newParentProcess
    • 创建一对pipe,parentPipe和childPipe,作为 runc start 进程与容器内部 init 进程通信管道
    • 创建一个命令模版作为 Parent 进程启动的模板
    • newInitProcess 封装 initProcess。主要工作为添加初始化类型环境变量,将namespace、uid/gid 映射等信息使用 bootstrapData 封装为一个 io.Reader
    func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) {
           parentPipe, childPipe, err := utils.NewSockPair("init")
       
           cmd, err := c.commandTemplate(p, childPipe)
      
           if !doInit {
                  return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
           }
    
           // We only set up rootDir if we're not doing a `runc exec`. The reason for
           // this is to avoid cases where a racing, unprivileged process inside the
           // container can get access to the statedir file descriptor (which would
           // allow for container rootfs escape).
           rootDir, err := os.Open(c.root)
         
           cmd.ExtraFiles = append(cmd.ExtraFiles, rootDir)
           cmd.Env = append(cmd.Env,
                  fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
           return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir)
    }

        4.2.1.1 newInitProcess 添加初始化类型环境变量,并通过 bootstrapData 函数将 namespace、uid/gid 映射等信息以 netlink 消息格式封装为一个 io.Reader,最后返回 initProcess 结构体
    // newInitProcess marks cmd as a standard init through the
    // _LIBCONTAINER_INITTYPE environment variable, collects the configured
    // namespaces that have an explicit path, builds the bootstrap data from the
    // namespace clone flags and those paths, and returns the initProcess that
    // describes the container's init.
    func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) {
           cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
           // Only namespaces with a non-empty path are recorded.
           nsMaps := make(map[configs.NamespaceType]string)
           for _, ns := range c.config.Namespaces {
                  if ns.Path != "" {
                         nsMaps[ns.Type] = ns.Path
                  }
           }
           // sharePidns is true when a path was configured for the PID namespace.
           _, sharePidns := nsMaps[configs.NEWPID]
           data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
           if err != nil {
                  return nil, err
           }
           return &initProcess{
                  cmd:           cmd,
                  childPipe:     childPipe,
                  parentPipe:    parentPipe,
                  manager:       c.cgroupManager,
                  config:        c.newInitConfig(p),
                  container:     c,
                  process:       p,
                  bootstrapData: data,
                  sharePidns:    sharePidns,
                  rootDir:       rootDir,
           }, nil
    }

        4.2.2 parent.start 函数第五章节讲解,它创建了新的进程。新进程以 /proc/self/exe(即 runc 自身)为执行入口,参数为 init。Go 的 func init() 会在 main 函数之前执行,随后 main 函数通过 init 子命令进入容器初始化流程(见第六章)
    func (p *initProcess) start() error


    五. initProcess 分析


        5.0 initProcess 结构体
    type initProcess struct {
           cmd           *exec.Cmd
           parentPipe    *os.File
           childPipe     *os.File
           config        *initConfig
           manager       cgroups.Manager
           container     *linuxContainer
           fds           []string
           process       *Process
           bootstrapData io.Reader
           sharePidns    bool
           rootDir       *os.File
    }

        5.1 start 函数以下讲解
    func (p *initProcess) start() error

        5.1.1 p.cmd 的内容如文末 "cmd 内容" 所示: Path 填充为 /proc/self/exe(即 runc 自身),参数字段 Args 为 init,表示对容器进行初始化,也就是调用 runc init(第六章节讲解)
    defer p.parentPipe.Close()
    err := p.cmd.Start()
    p.process.ops = p
    p.childPipe.Close()
    p.rootDir.Close()
    if err != nil {
           p.process.ops = nil
           return newSystemErrorWithCause(err, "starting init process command")
    }

        5.1.2  Apply 设置进程 cgroup 进行限额
    if err := p.manager.Apply(p.pid()); err != nil {
           return newSystemErrorWithCause(err, "applying cgroup configuration for process")
    }

        5.1.3  createNetworkInterfaces  如果没有指定网络只设置 loopback,如果指定网络还有 veth 类型。
    if err := p.createNetworkInterfaces(); err != nil {
           return newSystemErrorWithCause(err, "creating network interfaces")
    }

        5.1.4  sendConfig 发送配置到 init process
    if err := p.sendConfig(); err != nil {
           return newSystemErrorWithCause(err, "sending config to init process")
    }



    六. runc init 分析


     【runc 源码分析】runc create / start 流程分析
                                                                      网上的图片
        6.0 根据命令行参数 init
    func init() {
           if len(os.Args) > 1 && os.Args[1] == "init" {
                  runtime.GOMAXPROCS(1)
                  runtime.LockOSThread()
           }
    }
    
    var initCommand = cli.Command{
           Name:  "init",
           Usage: `initialize the namespaces and launch the process (do not call it outside of runc)`,
           Action: func(context *cli.Context) error {
                  factory, _ := libcontainer.New("")
                  if err := factory.StartInitialization(); err != nil {
                         // as the error is sent back to the parent there is no need to log
                         // or write it to stderr because the parent process will handle this
                         os.Exit(1)
                  }
                  panic("libcontainer: container init failed to exec")
           },
    }

        6.1 StartInitialization 函数从环境变量中读取 init pipe、state 目录和 console 对应的文件描述符,然后调用 newContainerInit(见 6.1.1)并执行其 Init 方法
    func (l *LinuxFactory) StartInitialization() (err error) {
           var (
                  pipefd, rootfd int
                  consoleSocket  *os.File
                  envInitPipe    = os.Getenv("_LIBCONTAINER_INITPIPE")
                  envStateDir    = os.Getenv("_LIBCONTAINER_STATEDIR")
                  envConsole     = os.Getenv("_LIBCONTAINER_CONSOLE")
           )
           ......
    
           i, err := newContainerInit(it, pipe, consoleSocket, rootfd)
           if err != nil {
                  return err
           }
    
           // If Init succeeds, syscall.Exec will not return, hence none of the defers will be called.
           return i.Init()
    }

        6.1.1 newContainerInit 根据类型返回含有 Init 方法的结构体
    // newContainerInit decodes the initConfig JSON sent over pipe, passes
    // config.Env to populateProcessEnvironment, and returns the initer matching
    // t: a linuxSetnsInit for initSetns or a linuxStandardInit for initStandard.
    // Any other init type yields an "unknown init type" error.
    func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, stateDirFD int) (initer, error) {
           var config *initConfig
           if err := json.NewDecoder(pipe).Decode(&config); err != nil {
                  return nil, err
           }
           if err := populateProcessEnvironment(config.Env); err != nil {
                  return nil, err
           }
           switch t {
           case initSetns:
                  return &linuxSetnsInit{
                         pipe:          pipe,
                         consoleSocket: consoleSocket,
                         config:        config,
                  }, nil
           case initStandard:
                  // The standard init also records the parent pid at construction
                  // time and the state directory fd.
                  return &linuxStandardInit{
                         pipe:          pipe,
                         consoleSocket: consoleSocket,
                         parentPid:     unix.Getppid(),
                         config:        config,
                         stateDirFD:    stateDirFD,
                  }, nil
           }
           return nil, fmt.Errorf("unknown init type %q", t)
    }

        6.1.2 如果类型为 initStandard 则结构体 linuxStandardInit 实现了 Init 方法,容器内初始化做了一大堆事,路径 libcontainer/standard_init_linux.go
    • setupNetwork: 配置容器的网络,调用第三方 netlink.LinkSetup
    • setupRoute: 配置容器静态路由信息,调用第三方 netlink.RouteAdd
    • label.Init:   检查selinux是否被启动并将结果存入全局变量。
    • finalizeNamespace: 根据config配置将需要的特权capabilities加入白名单,设置user namespace,关闭不需要的文件描述符。
    • unix.Openat: 只写方式打开fifo管道并写入0,会一直保持阻塞,直到管道的另一端以读方式打开,并读取内容
    • syscall.Exec 系统调用来执行用户所指定的在容器中运行的程序
       配置 hostname、apparmor、processLabel、sysctl、readonlyPath、maskPath。create 虽然不会执行命令,但会检查命令路径,错误会在 create 期间返回
    func (l *linuxStandardInit) Init() error {
           if err := setupNetwork(l.config); err != nil {
        
           if err := setupRoute(l.config.Config); err != nil {
        
           label.Init()
    
           // Finish the rootfs setup.
           if l.config.Config.Namespaces.Contains(configs.NEWNS) {
                  if err := finalizeRootfs(l.config.Config); err != nil {
                 
           }
    
           if err := syncParentReady(l.pipe); err != nil {
     
           if err := finalizeNamespace(l.config); err != nil {
    
           if err := pdeath.Restore(); err != nil {
      
           name, err := exec.LookPath(l.config.Args[0])
     
           fd, err := unix.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|unix.O_CLOEXEC, 0)
         
           if _, err := unix.Write(fd, []byte("0")); err != nil {
                  return newSystemErrorWithCause(err, "write 0 exec fifo")
           }
           if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
                  if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
                         return newSystemErrorWithCause(err, "init seccomp")
                  }
           }
           // close the statedir fd before exec because the kernel resets dumpable in the wrong order
           // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
           unix.Close(l.stateDirFD)
           if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
                  return newSystemErrorWithCause(err, "exec user process")
           }
           return nil
    }





    运行一个容器


         创建容器标准包,使用 bundle 模块实现,将 docker 镜像转换成容器标准包
               $ docker pull busybox
               $ mkdir rootfs
               $ docker export $(docker create busybox) | tar -C rootfs -xvf -
         创建配置文件
               $ runc spec
         运行容器 
               $ runc run busybox     


    create 总结:
    • 创建容器的 linux factory,然后调用其 Create 方法,这个比较简单,主要是对参数进行校验,创建容器根目录并设置其属主 uid、gid,返回代表容器的结构体
    • 封装一个 runner 结构体,调用 run 方法,将 config.json 中 process 填充到结构体,根据 action 为创建的调用 container.start(process)
    • 创建管道与容器内进行通信,一个 init process 结构体封装了命令,配置,cgroup,带有 namespace 的 netlink 请求 
    • 父进程启动 runc init 作为容器的 init 进程,随后为其配置 cgroup,创建网络接口,并通过管道把配置发送给容器内的 init 进程
    • 容器内从管道读取配置进行初始化配置



    cmd 内容:
    {
        Path: "/proc/self/exe",
        Args: [
            
        ]string{
            "/proc/self/exe",
            "init"

        },
        Env: [
            
        ]string{
            "_LIBCONTAINER_CONSOLE=3",
            "_LIBCONTAINER_INITPIPE=4",
            "_LIBCONTAINER_STATEDIR=5",
            "_LIBCONTAINER_INITTYPE=standard"
        },
        Dir: "/home/lin/project/src/github.com/opencontainers/runc/mycontainers/rootfs",
        Stdin: io.Reader(nil),
        Stdout: io.Writer(nil),
        Stderr: io.Writer(nil),
        ExtraFiles: [
            
        ]*os.File{
            (*os.File)(0xc42000e120),
            (*os.File)(0xc42000e140),
            (*os.File)(0xc42000e150)
        },
        SysProcAttr: (*syscall.SysProcAttr)(0xc4200981b0),
        Process: (*os.Process)(nil),
        ProcessState: (*os.ProcessState)(nil),
        ctx: context.Context(nil),
        lookPathErr: error(nil),
        finished: false,
        childFiles: [
            
        ]*os.File(nil),
        closeAfterStart: [
            
        ]io.Closer(nil),
        closeAfterWait: [
            
        ]io.Closer(nil),
        goroutine: [
            
        ]func()error(nil),
        errch: (chan error)(nil),
        waitDone: (chan struct{})(nil)
    }


    /var/run/runc/${container-id}/state.json
    {
        "id": "container-bbbb",
        "init_process_pid": 3193,
        "init_process_start": 7331,
        "created": "2017-08-15T02:30:51.244343167Z",
        "config": {
            "no_pivot_root": false,
            "parent_death_signal": 0,
            "rootfs": "/home/lin/project/src/github.com/opencontainers/runc/mycontainers/rootfs",
            "readonlyfs": true,
            "rootPropagation": 278528,
            "mounts": [
                {
                    "source": "proc",
                    "destination": "/proc",
                    "device": "proc",
                    "flags": 0,
                    "propagation_flags": null,
                    "data": "",
                    "relabel": "",
                    "extensions": 0,
                    "premount_cmds": null,
                    "postmount_cmds": null
                },
                {
                    "source": "tmpfs",
                    "destination": "/dev",
                    "device": "tmpfs",
                    "flags": 16777218,
                    "propagation_flags": null,
                    "data": "mode=755,size=65536k",
                    "relabel": "",
                    "extensions": 0,
                    "premount_cmds": null,
                    "postmount_cmds": null
                },
                {
                    "source": "devpts",
                    "destination": "/dev/pts",
                    "device": "devpts",
                    "flags": 10,
                    "propagation_flags": null,
                    "data": "newinstance,ptmxmode=0666,mode=0620,gid=5",
                    "relabel": "",
                    "extensions": 0,
                    "premount_cmds": null,
                    "postmount_cmds": null
                },
                {
                    "source": "shm",
                    "destination": "/dev/shm",
                    "device": "tmpfs",
                    "flags": 14,
                    "propagation_flags": null,
                    "data": "mode=1777,size=65536k",
                    "relabel": "",
                    "extensions": 0,
                    "premount_cmds": null,
                    "postmount_cmds": null
                },
                {
                    "source": "mqueue",
                    "destination": "/dev/mqueue",
                    "device": "mqueue",
                    "flags": 14,
                    "propagation_flags": null,
                    "data": "",
                    "relabel": "",
                    "extensions": 0,
                    "premount_cmds": null,
                    "postmount_cmds": null
                },
                {
                    "source": "sysfs",
                    "destination": "/sys",
                    "device": "sysfs",
                    "flags": 15,
                    "propagation_flags": null,
                    "data": "",
                    "relabel": "",
                    "extensions": 0,
                    "premount_cmds": null,
                    "postmount_cmds": null
                },
                {
                    "source": "cgroup",
                    "destination": "/sys/fs/cgroup",
                    "device": "cgroup",
                    "flags": 2097167,
                    "propagation_flags": null,
                    "data": "",
                    "relabel": "",
                    "extensions": 0,
                    "premount_cmds": null,
                    "postmount_cmds": null
                }
            ],
            "devices": [
                {
                    "type": 99,
                    "path": "/dev/null",
                    "major": 1,
                    "minor": 3,
                    "permissions": "",
                    "file_mode": 438,
                    "uid": 0,
                    "gid": 0,
                    "allow": false
                },
                {
                    "type": 99,
                    "path": "/dev/random",
                    "major": 1,
                    "minor": 8,
                    "permissions": "",
                    "file_mode": 438,
                    "uid": 0,
                    "gid": 0,
                    "allow": false
                },
                {
                    "type": 99,
                    "path": "/dev/full",
                    "major": 1,
                    "minor": 7,
                    "permissions": "",
                    "file_mode": 438,
                    "uid": 0,
                    "gid": 0,
                    "allow": false
                },
                {
                    "type": 99,
                    "path": "/dev/tty",
                    "major": 5,
                    "minor": 0,
                    "permissions": "",
                    "file_mode": 438,
                    "uid": 0,
                    "gid": 0,
                    "allow": false
                },
                {
                    "type": 99,
                    "path": "/dev/zero",
                    "major": 1,
                    "minor": 5,
                    "permissions": "",
                    "file_mode": 438,
                    "uid": 0,
                    "gid": 0,
                    "allow": false
                },
                {
                    "type": 99,
                    "path": "/dev/urandom",
                    "major": 1,
                    "minor": 9,
                    "permissions": "",
                    "file_mode": 438,
                    "uid": 0,
                    "gid": 0,
                    "allow": false
                }
            ],
            "mount_label": "",
            "hostname": "runc",
            "namespaces": [
                {
                    "type": "NEWPID",
                    "path": ""
                },
                {
                    "type": "NEWNET",
                    "path": ""
                },
                {
                    "type": "NEWIPC",
                    "path": ""
                },
                {
                    "type": "NEWUTS",
                    "path": ""
                },
                {
                    "type": "NEWNS",
                    "path": ""
                }
            ],
            "capabilities": {
                "Bounding": [
                    "CAP_AUDIT_WRITE",
                    "CAP_KILL",
                    "CAP_NET_BIND_SERVICE"
                ],
                "Effective": [
                    "CAP_AUDIT_WRITE",
                    "CAP_KILL",
                    "CAP_NET_BIND_SERVICE"
                ],
                "Inheritable": [
                    "CAP_AUDIT_WRITE",
                    "CAP_KILL",
                    "CAP_NET_BIND_SERVICE"
                ],
                "Permitted": [
                    "CAP_AUDIT_WRITE",
                    "CAP_KILL",
                    "CAP_NET_BIND_SERVICE"
                ],
                "Ambient": [
                    "CAP_AUDIT_WRITE",
                    "CAP_KILL",
                    "CAP_NET_BIND_SERVICE"
                ]
            },
            "networks": [
                {
                    "type": "loopback",
                    "name": "",
                    "bridge": "",
                    "mac_address": "",
                    "address": "",
                    "gateway": "",
                    "ipv6_address": "",
                    "ipv6_gateway": "",
                    "mtu": 0,
                    "txqueuelen": 0,
                    "host_interface_name": "",
                    "hairpin_mode": false
                }
            ],
            "routes": null,
            "cgroups": {
                "name": "container-bbbb",
                "path": "",
                "scope_prefix": "",
                "Paths": null,
                "allowed_devices": [
                    {
                        "type": 99,
                        "path": "",
                        "major": -1,
                        "minor": -1,
                        "permissions": "m",
                        "file_mode": 0,
                        "uid": 0,
                        "gid": 0,
                        "allow": true
                    },
                    {
                        "type": 98,
                        "path": "",
                        "major": -1,
                        "minor": -1,
                        "permissions": "m",
                        "file_mode": 0,
                        "uid": 0,
                        "gid": 0,
                        "allow": true
                    },
                    {
                        "type": 99,
                        "path": "/dev/null",
                        "major": 1,
                        "minor": 3,
                        "permissions": "rwm",
                        "file_mode": 0,
                        "uid": 0,
                        "gid": 0,
                        "allow": true
                    },
                    {
                        "type": 99,
                        "path": "/dev/random",
                        "major": 1,
                        "minor": 8,
                        "permissions": "rwm",
                        "file_mode": 0,
                        "uid": 0,
                        "gid": 0,
                        "allow": true
                    },
                    {
                        "type": 99,
                        "path": "/dev/full",
                        "major": 1,
                        "minor": 7,
                        "permissions": "rwm",
                        "file_mode": 0,
                        "uid": 0,
                        "gid": 0,
                        "allow": true
                    },
                    {
                        "type": 99,
                        "path": "/dev/tty",
                        "major": 5,
                        "minor": 0,
                        "permissions": "rwm",
                        "file_mode": 0,
                        "uid": 0,
                        "gid": 0,
                        "allow": true
                    },
                    {
                        "type": 99,
                        "path": "/dev/zero",
                        "major": 1,
                        "minor": 5,
                        "permissions": "rwm",
                        "file_mode": 0,
                        "uid": 0,
                        "gid": 0,
                        "allow": true
                    },
                    {
                        "type": 99,
                        "path": "/dev/urandom",
                        "major": 1,
                        "minor": 9,
                        "permissions": "rwm",
                        "file_mode": 0,
                        "uid": 0,
                        "gid": 0,
                        "allow": true
                    },
                    {
                        "type": 99,
                        "path": "/dev/console",
                        "major": 5,
                        "minor": 1,
                        "permissions": "rwm",
                        "file_mode": 0,
                        "uid": 0,
                        "gid": 0,
                        "allow": true
                    },
                    {
                        "type": 99,
                        "path": "",
                        "major": 136,
                        "minor": -1,
                        "permissions": "rwm",
                        "file_mode": 0,
                        "uid": 0,
                        "gid": 0,
                        "allow": true
                    },
                    {
                        "type": 99,
                        "path": "",
                        "major": 5,
                        "minor": 2,
                        "permissions": "rwm",
                        "file_mode": 0,
                        "uid": 0,
                        "gid": 0,
                        "allow": true
                    },
                    {
                        "type": 99,
                        "path": "",
                        "major": 10,
                        "minor": 200,
                        "permissions": "rwm",
                        "file_mode": 0,
                        "uid": 0,
                        "gid": 0,
                        "allow": true
                    }
                ],
                "devices": [
                    {
                        "type": 97,
                        "path": "",
                        "major": -1,
                        "minor": -1,
                        "permissions": "rwm",
                        "file_mode": 0,
                        "uid": 0,
                        "gid": 0,
                        "allow": false
                    },
                    {
                        "type": 99,
                        "path": "",
                        "major": -1,
                        "minor": -1,
                        "permissions": "m",
                        "file_mode": 0,
                        "uid": 0,
                        "gid": 0,
                        "allow": true
                    },
                    {
                        "type": 98,
                        "path": "",
                        "major": -1,
                        "minor": -1,
                        "permissions": "m",
                        "file_mode": 0,
                        "uid": 0,
                        "gid": 0,
                        "allow": true
                    },
                    {
                        "type": 99,
                        "path": "/dev/null",
                        "major": 1,
                        "minor": 3,
                        "permissions": "rwm",
                        "file_mode": 0,
                        "uid": 0,
                        "gid": 0,
                        "allow": true
                    },
                    {
                        "type": 99,
                        "path": "/dev/random",
                        "major": 1,
                        "minor": 8,
                        "permissions": "rwm",
                        "file_mode": 0,
                        "uid": 0,
                        "gid": 0,
                        "allow": true
                    },
                    {
                        "type": 99,
                        "path": "/dev/full",
                        "major": 1,
                        "minor": 7,
                        "permissions": "rwm",
                        "file_mode": 0,
                        "uid": 0,
                        "gid": 0,
                        "allow": true
                    },
                    {
                        "type": 99,
                        "path": "/dev/tty",
                        "major": 5,
                        "minor": 0,
                        "permissions": "rwm",
                        "file_mode": 0,
                        "uid": 0,
                        "gid": 0,
                        "allow": true
                    },
                    {
                        "type": 99,
                        "path": "/dev/zero",
                        "major": 1,
                        "minor": 5,
                        "permissions": "rwm",
                        "file_mode": 0,
                        "uid": 0,
                        "gid": 0,
                        "allow": true
                    },
                    {
                        "type": 99,
                        "path": "/dev/urandom",
                        "major": 1,
                        "minor": 9,
                        "permissions": "rwm",
                        "file_mode": 0,
                        "uid": 0,
                        "gid": 0,
                        "allow": true
                    },
                    {
                        "type": 99,
                        "path": "/dev/console",
                        "major": 5,
                        "minor": 1,
                        "permissions": "rwm",
                        "file_mode": 0,
                        "uid": 0,
                        "gid": 0,
                        "allow": true
                    },
                    {
                        "type": 99,
                        "path": "",
                        "major": 136,
                        "minor": -1,
                        "permissions": "rwm",
                        "file_mode": 0,
                        "uid": 0,
                        "gid": 0,
                        "allow": true
                    },
                    {
                        "type": 99,
                        "path": "",
                        "major": 5,
                        "minor": 2,
                        "permissions": "rwm",
                        "file_mode": 0,
                        "uid": 0,
                        "gid": 0,
                        "allow": true
                    },
                    {
                        "type": 99,
                        "path": "",
                        "major": 10,
                        "minor": 200,
                        "permissions": "rwm",
                        "file_mode": 0,
                        "uid": 0,
                        "gid": 0,
                        "allow": true
                    }
                ],
                "memory": 0,
                "memory_reservation": 0,
                "memory_swap": 0,
                "kernel_memory": 0,
                "kernel_memory_tcp": 0,
                "cpu_shares": 0,
                "cpu_quota": 0,
                "cpu_period": 0,
                "cpu_rt_quota": 0,
                "cpu_rt_period": 0,
                "cpuset_cpus": "",
                "cpuset_mems": "",
                "pids_limit": 0,
                "blkio_weight": 0,
                "blkio_leaf_weight": 0,
                "blkio_weight_device": null,
                "blkio_throttle_read_bps_device": null,
                "blkio_throttle_write_bps_device": null,
                "blkio_throttle_read_iops_device": null,
                "blkio_throttle_write_iops_device": null,
                "freezer": "",
                "hugetlb_limit": null,
                "oom_kill_disable": false,
                "memory_swappiness": null,
                "net_prio_ifpriomap": null,
                "net_cls_classid_u": 0
            },
            "oom_score_adj": 0,
            "uid_mappings": null,
            "gid_mappings": null,
            "mask_paths": [
                "/proc/kcore",
                "/proc/latency_stats",
                "/proc/timer_list",
                "/proc/timer_stats",
                "/proc/sched_debug",
                "/sys/firmware"
            ],
            "readonly_paths": [
                "/proc/asound",
                "/proc/bus",
                "/proc/fs",
                "/proc/irq",
                "/proc/sys",
                "/proc/sysrq-trigger"
            ],
            "sysctl": null,
            "seccomp": null,
            "Hooks": {
                "poststart": null,
                "poststop": null,
                "prestart": null
            },
            "version": "1.0.0",
            "labels": [
                "bundle=/home/lin/project/src/github.com/opencontainers/runc/mycontainers"
            ],
            "no_new_keyring": false,
            "rootless": false
        },
        "rootless": false,
        "cgroup_paths": {
            "blkio": "/sys/fs/cgroup/blkio/user.slice/container-bbbb",
            "cpu": "/sys/fs/cgroup/cpu,cpuacct/user.slice/container-bbbb",
            "cpuacct": "/sys/fs/cgroup/cpu,cpuacct/user.slice/container-bbbb",
            "cpuset": "/sys/fs/cgroup/cpuset/container-bbbb",
            "devices": "/sys/fs/cgroup/devices/user.slice/container-bbbb",
            "freezer": "/sys/fs/cgroup/freezer/container-bbbb",
            "hugetlb": "/sys/fs/cgroup/hugetlb/container-bbbb",
            "memory": "/sys/fs/cgroup/memory/user.slice/container-bbbb",
            "name=systemd": "/sys/fs/cgroup/systemd/user.slice/user-1000.slice/session-c2.scope/container-bbbb",
            "net_cls": "/sys/fs/cgroup/net_cls,net_prio/container-bbbb",
            "net_prio": "/sys/fs/cgroup/net_cls,net_prio/container-bbbb",
            "perf_event": "/sys/fs/cgroup/perf_event/container-bbbb",
            "pids": "/sys/fs/cgroup/pids/user.slice/user-1000.slice/container-bbbb"
        },
        "namespace_paths": {
            "NEWIPC": "/proc/3193/ns/ipc",
            "NEWNET": "/proc/3193/ns/net",
            "NEWNS": "/proc/3193/ns/mnt",
            "NEWPID": "/proc/3193/ns/pid",
            "NEWUSER": "/proc/3193/ns/user",
            "NEWUTS": "/proc/3193/ns/uts"
        },
        "external_descriptors": [
            "/dev/null",
            "/dev/null",
            "/dev/null"
        ]
    }


    runc start -h

       runc start <container-id>


    runc start 命令分析

       路径 start.go:命令行的主要逻辑由 Action 定义。如果容器只是被创建(状态为 created),则执行 container.Exec();如果状态为 stopped 或 running,则直接返回错误。

    // Action is the handler for `runc start`. It checks the command-line
    // arguments, resolves the container, and dispatches on the container's
    // current status:
    //   - Created: the container is started via container.Exec().
    //   - Stopped / Running: an explanatory error is returned.
    //   - any other status: a generic error naming that status is returned.
    Action: func(context *cli.Context) error {
           // Argument check; presumably enforces exactly one argument
           // (the container id) — confirm against checkArgs.
           if err := checkArgs(context, 1, exactArgs); err != nil {
                  return err
           }
           container, err := getContainer(context)
           if err != nil {
                  return err
           }
           status, err := container.Status()
           if err != nil {
                  return err
           }
           switch status {
           case libcontainer.Created:
                  return container.Exec()
           case libcontainer.Stopped:
                  return errors.New("cannot start a container that has stopped")
           case libcontainer.Running:
                  return errors.New("cannot start an already running container")
           default:
                  // Error strings must not end in a newline: callers wrap or
                  // log them and would otherwise emit a stray line break.
                  return fmt.Errorf("cannot start a container in the %s state", status)
           }
    },


       路径 libcontainer/container_linux.go,主要读取 /var/run/runc/${container-id}/exec.fifo 的内容,读取成功后删除该文件,同时恢复在 Create 阶段被阻塞的初始化进程。

    // Exec is the exported entry point used to start a created container.
    // It holds the container mutex c.m for the whole call and delegates the
    // actual work to the unexported exec method.
    func (c *linuxContainer) Exec() error {
           c.m.Lock()
           // Released on return, whatever exec reports.
           defer c.m.Unlock()

           return c.exec()
    }
    
    // exec opens the exec FIFO located under c.root read-only and reads
    // everything written to it.
    //
    // It returns nil when at least one byte was read; in that case the FIFO
    // file is removed afterwards. An empty read is reported as
    // "cannot start an already running container". A failure to open the
    // FIFO is wrapped with newSystemErrorWithCause; a read failure is
    // returned as-is.
    //
    // NOTE(review): draining the FIFO is presumably what releases the init
    // process left blocked by create — confirm against the init side.
    func (c *linuxContainer) exec() error {
           fifoPath := filepath.Join(c.root, execFifoFilename)

           fifo, err := os.OpenFile(fifoPath, os.O_RDONLY, 0)
           if err != nil {
                  return newSystemErrorWithCause(err, "open exec fifo for reading")
           }
           defer fifo.Close()

           payload, err := ioutil.ReadAll(fifo)
           if err != nil {
                  return err
           }
           // Nothing was written to the FIFO: treat the container as
           // already started.
           if len(payload) == 0 {
                  return fmt.Errorf("cannot start an already running container")
           }

           // Best-effort cleanup: the result of Remove is deliberately not
           // inspected, matching the original behavior.
           os.Remove(fifoPath)
           return nil
    }

    相关文章: