dream397

docker系列--namespace解读

 

// Config defines configuration options for executing a process inside a contained environment.
// Only the namespace-related fields are shown in this excerpt; the elided
// fields are marked with "...".
type Config struct {
    ...
 
    // Namespaces specifies the container's namespaces that it should setup when cloning the init process.
    // If a namespace is not provided that namespace is shared from the container's parent process.
    Namespaces Namespaces `json:"namespaces"`
 
    // UidMappings is an array of User ID mappings for User Namespaces.
    UidMappings []IDMap `json:"uid_mappings"`
 
    // GidMappings is an array of Group ID mappings for User Namespaces.
    GidMappings []IDMap `json:"gid_mappings"`
 
    ...
}

runC中namespace的源码主要在: runc/libcontainer/configs/namespaces_unix.go。runC支持的namespace type包括($nsName) "net"、"mnt"、"pid"、"ipc"、"user"、"uts":

// Namespace kinds that a container configuration may refer to. Each value is
// the string form of a NamespaceType; the same constants are used as keys of
// the namespaceInfo table, which maps them to the matching CLONE_NEW* flag.
const (
       NEWNET  NamespaceType = "NEWNET"
       NEWPID  NamespaceType = "NEWPID"
       NEWNS   NamespaceType = "NEWNS"
       NEWUTS  NamespaceType = "NEWUTS"
       NEWIPC  NamespaceType = "NEWIPC"
       NEWUSER NamespaceType = "NEWUSER"
)

除了验证 Namespace Type 是否在以上常量中,还要验证 /proc/self/ns/$nsName 是否存在并且可读;两项都通过时,才认为该 Namespace 在当前系统中是被支持的。

root@cloud:~/iso# ls /proc/self/ns/ -al
total 0
dr-x--x--x 2 root root 0 Dec  4 14:51 .
dr-xr-xr-x 9 root root 0 Dec  4 14:51 ..
lrwxrwxrwx 1 root root 0 Dec  4 14:51 cgroup -> 'cgroup:[4026531835]'
lrwxrwxrwx 1 root root 0 Dec  4 14:51 ipc -> 'ipc:[4026531839]'
lrwxrwxrwx 1 root root 0 Dec  4 14:51 mnt -> 'mnt:[4026531840]'
lrwxrwxrwx 1 root root 0 Dec  4 14:51 net -> 'net:[4026531896]'
lrwxrwxrwx 1 root root 0 Dec  4 14:51 pid -> 'pid:[4026531836]'
lrwxrwxrwx 1 root root 0 Dec  4 14:51 pid_for_children -> 'pid:[4026531836]'
lrwxrwxrwx 1 root root 0 Dec  4 14:51 user -> 'user:[4026531837]'
lrwxrwxrwx 1 root root 0 Dec  4 14:51 uts -> 'uts:[4026531838]'
root@cloud:~/iso# 

 

root@cloud:~/iso# unshare -m -u --propagation unchanged /bin/bash

root@cloud:~/iso# ls /proc/self/ns/ -al
total 0
dr-x--x--x 2 root root 0 Dec  4 14:52 .
dr-xr-xr-x 9 root root 0 Dec  4 14:52 ..
lrwxrwxrwx 1 root root 0 Dec  4 14:52 cgroup -> 'cgroup:[4026531835]'
lrwxrwxrwx 1 root root 0 Dec  4 14:52 ipc -> 'ipc:[4026531839]'
lrwxrwxrwx 1 root root 0 Dec  4 14:52 mnt -> 'mnt:[4026533784]'
lrwxrwxrwx 1 root root 0 Dec  4 14:52 net -> 'net:[4026531896]'
lrwxrwxrwx 1 root root 0 Dec  4 14:52 pid -> 'pid:[4026531836]'
lrwxrwxrwx 1 root root 0 Dec  4 14:52 pid_for_children -> 'pid:[4026531836]'
lrwxrwxrwx 1 root root 0 Dec  4 14:52 user -> 'user:[4026531837]'
lrwxrwxrwx 1 root root 0 Dec  4 14:52 uts -> 'uts:[4026533786]'
root@cloud:~/iso# 

如下是Namespace的完整定义,很简单,只包括NamespaceType和对应的Path。

// Namespace defines configuration for each namespace.  It specifies an
// alternate path that is able to be joined via setns.
type Namespace struct {
       // Type is the kind of namespace being configured.
       Type NamespaceType `json:"type"`
       // Path, when non-empty, is returned verbatim by GetPath instead of a
       // path derived from /proc/<pid>/ns.
       Path string        `json:"path"`
}

 

 

从Namespace的GetPath方法中可见,一个pid对应的namespace path为 /proc/$pid/ns/$nsName。
// GetPath returns the path of this namespace for the process identified by
// pid. An explicitly configured Path is returned as-is; otherwise the path is
// built as /proc/<pid>/ns/<name>, where <name> comes from NsName(n.Type).
func (n *Namespace) GetPath(pid int) string {
	if n.Path == "" {
		name := NsName(n.Type)
		return fmt.Sprintf("/proc/%d/ns/%s", pid, name)
	}
	return n.Path
}

 

root@cloud:~/iso# ls /proc/$$/ns/ -al
total 0
dr-x--x--x 2 root root 0 Dec  4 14:55 .
dr-xr-xr-x 9 root root 0 Dec  4 14:55 ..
lrwxrwxrwx 1 root root 0 Dec  4 14:55 cgroup -> 'cgroup:[4026531835]'
lrwxrwxrwx 1 root root 0 Dec  4 14:55 ipc -> 'ipc:[4026531839]'
lrwxrwxrwx 1 root root 0 Dec  4 14:55 mnt -> 'mnt:[4026533784]'
lrwxrwxrwx 1 root root 0 Dec  4 14:55 net -> 'net:[4026531896]'
lrwxrwxrwx 1 root root 0 Dec  4 14:55 pid -> 'pid:[4026531836]'
lrwxrwxrwx 1 root root 0 Dec  4 14:55 pid_for_children -> 'pid:[4026531836]'
lrwxrwxrwx 1 root root 0 Dec  4 14:55 user -> 'user:[4026531837]'
lrwxrwxrwx 1 root root 0 Dec  4 14:55 uts -> 'uts:[4026533786]'
root@cloud:~/iso# 

 

// start creates the parent process for the given process, starts it, and
// records the creation time on the container. When the process is the
// container's init process it additionally moves the container into the
// created state, persists that state via updateState, and runs the configured
// Poststart hooks. If starting the parent or running a hook fails, the parent
// is terminated before the error is returned.
func (c *linuxContainer) start(process *Process) error {
	// NOTE: the block below is a disabled bind mount of /run/sockets/qemu_pipe
	// into <rootfs>/vmi/sockets; it is kept commented out exactly as found.
	//if false == cPathExists("/run/sockets/qemu_pipe") {
	//                                       return newSystemErrorWithCausef(nil, "mount bind /run/sockets failed %s , /run/sockets/qemu_pipe not exist", c.config.Rootfs)
	//}
	//input_dir := filepath.Join(c.config.Rootfs, "/vmi/sockets")
	//if err := os.MkdirAll(input_dir, 0777); err != nil {
	//       return newSystemErrorWithCause(err, "mkdir rootfs/sockets/")
	//}
	//if err := unix.Mount("/run/sockets/qemu_pipe", input_dir, "", unix.MS_REC|unix.MS_BIND, ""); err != nil {
	//       return newSystemErrorWithCausef(err, "mount bind /run/sockets failed %s", c.config.Rootfs)
	//}
	parent, err := c.newParentProcess(process)
	if err != nil {
		return newSystemErrorWithCause(err, "creating new parent process")
	}
	parent.forwardChildLogs()

	if startErr := parent.start(); startErr != nil {
		// Terminate the process so that it is properly reaped; a failure to
		// terminate is only logged, the start error is what gets reported.
		if termErr := ignoreTerminateErrors(parent.terminate()); termErr != nil {
			logrus.Warn(termErr)
		}
		return newSystemErrorWithCause(startErr, "starting container process")
	}

	// Timestamp indicating when the container was started.
	c.created = time.Now().UTC()

	// Everything below only applies to the container's init process.
	if !process.Init {
		return nil
	}

	c.state = &createdState{c: c}
	state, err := c.updateState(parent)
	if err != nil {
		return err
	}
	c.initProcessStartTime = state.InitProcessStartTime

	hooks := c.config.Hooks
	if hooks == nil {
		return nil
	}
	ociState, err := c.currentOCIState()
	if err != nil {
		return err
	}
	for idx, hook := range hooks.Poststart {
		hookErr := hook.Run(ociState)
		if hookErr == nil {
			continue
		}
		// A failing poststart hook tears the container process down again.
		if termErr := ignoreTerminateErrors(parent.terminate()); termErr != nil {
			logrus.Warn(termErr)
		}
		return newSystemErrorWithCausef(hookErr, "running poststart hook %d", idx)
	}
	return nil
}

 

 

// updateState optionally replaces the container's init process with the given
// one (a nil process leaves it untouched), then computes the current state,
// saves it with saveState, and returns it. On any error the returned state is
// nil.
func (c *linuxContainer) updateState(process parentProcess) (*State, error) {
	if process != nil {
		c.initProcess = process
	}

	snapshot, err := c.currentState()
	if err != nil {
		return nil, err
	}
	if err := c.saveState(snapshot); err != nil {
		return nil, err
	}
	return snapshot, nil
}

 

 

// currentState assembles a State value describing the container as it is now.
//
// When an init process is known, its pid, start time and external descriptors
// are recorded; otherwise the pid is -1. For a positive pid, NamespacePaths is
// filled first from the configured namespaces and then, for every supported
// namespace type not yet present, with the path derived from the pid.
func (c *linuxContainer) currentState() (*State, error) {
	pid := -1
	var startTime uint64
	var externalDescriptors []string

	if initProc := c.initProcess; initProc != nil {
		pid = initProc.pid()
		// The error from startTime is dropped here, as in the original code;
		// startTime then holds whatever value the call returned.
		startTime, _ = initProc.startTime()
		externalDescriptors = initProc.externalDescriptors()
	}

	// A lookup failure is not fatal: the path is simply recorded as empty.
	rdtPath, err := intelrdt.GetIntelRdtPath(c.ID())
	if err != nil {
		rdtPath = ""
	}

	base := BaseState{
		ID:                   c.ID(),
		Config:               *c.config,
		InitProcessPid:       pid,
		InitProcessStartTime: startTime,
		Created:              c.created,
	}
	state := &State{
		BaseState:           base,
		Rootless:            c.config.RootlessEUID && c.config.RootlessCgroups,
		CgroupPaths:         c.cgroupManager.GetPaths(),
		IntelRdtPath:        rdtPath,
		NamespacePaths:      make(map[configs.NamespaceType]string),
		ExternalDescriptors: externalDescriptors,
	}

	// Without a live pid there is nothing to resolve namespace paths against.
	if pid <= 0 {
		return state, nil
	}

	// Namespaces named in the config come first (an explicit Path wins).
	for _, configured := range c.config.Namespaces {
		state.NamespacePaths[configured.Type] = configured.GetPath(pid)
	}
	// Then add every supported namespace type that the config did not name.
	for _, nsType := range configs.NamespaceTypes() {
		if !configs.IsNamespaceSupported(nsType) {
			continue
		}
		if _, present := state.NamespacePaths[nsType]; present {
			continue
		}
		fallback := configs.Namespace{Type: nsType}
		state.NamespacePaths[fallback.Type] = fallback.GetPath(pid)
	}
	return state, nil
}

 

 

 

 

 

除此之外,还定义了以下常用方法:

func (n *Namespaces) Remove(t NamespaceType) bool {...}
 
func (n *Namespaces) Add(t NamespaceType, path string) {...}
 
func (n *Namespaces) index(t NamespaceType) int {...}
 
func (n *Namespaces) Contains(t NamespaceType) bool {...}
 
func (n *Namespaces) PathOf(t NamespaceType) string {...}

在runc/libcontainer/configs/namespaces_syscall.go中,定义了linux clone时这些namespace对应的clone flags。

// namespaceInfo maps each NamespaceType to the syscall CLONE_NEW* flag that
// requests a new namespace of that kind. CloneFlags ORs these values together.
var namespaceInfo = map[NamespaceType]int{
       NEWNET:  syscall.CLONE_NEWNET,
       NEWNS:   syscall.CLONE_NEWNS,
       NEWUSER: syscall.CLONE_NEWUSER,
       NEWIPC:  syscall.CLONE_NEWIPC,
       NEWUTS:  syscall.CLONE_NEWUTS,
       NEWPID:  syscall.CLONE_NEWPID,
}
 
// CloneFlags parses the container's Namespaces options to set the correct
// flags on clone, unshare. Only namespaces without a Path contribute a flag,
// so the result describes the namespaces that have to be newly created; a type
// missing from namespaceInfo contributes nothing.
func (n *Namespaces) CloneFlags() uintptr {
	var flags int
	for _, ns := range *n {
		// An entry with a Path is skipped: it does not request a new namespace.
		if ns.Path == "" {
			flags |= namespaceInfo[ns.Type]
		}
	}
	return uintptr(flags)
}

 

上面的CloneFlags()方法是用来解析linuxContainer的config中的namespace相关的参数,生成clone flags,提供给linuxContainer.bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) 来封装。

 

// bootstrapData encodes the necessary data in netlink binary format
// as a io.Reader.
// Consumer can write the data to a bootstrap program
// such as one that uses nsenter package to bootstrap the container's
// init process correctly, i.e. with correct namespaces, uid/gid
// mapping etc.
//
// The message always carries cloneFlags. The ordered namespace paths are added
// when nsMaps is non-empty. The uid/gid mappings (and, where applicable, the
// setgroup attribute) are added only when nsMaps has no NEWUSER entry.
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (io.Reader, error) {
	// Create the netlink message.
	req := nl.NewNetlinkRequest(int(InitMsg), 0)

	// Clone flags.
	req.AddData(&Int32msg{Type: CloneFlagsAttr, Value: uint32(cloneFlags)})

	// Custom namespace paths, comma separated in the order chosen by
	// orderNamespacePaths.
	if len(nsMaps) > 0 {
		ordered, err := c.orderNamespacePaths(nsMaps)
		if err != nil {
			return nil, err
		}
		joined := strings.Join(ordered, ",")
		req.AddData(&Bytemsg{Type: NsPathsAttr, Value: []byte(joined)})
	}

	// ID mappings are written only when we are not joining an existing user
	// namespace.
	if _, joinExistingUser := nsMaps[configs.NEWUSER]; joinExistingUser {
		return bytes.NewReader(req.Serialize()), nil
	}

	// uid mappings
	if uidMaps := c.config.UidMappings; len(uidMaps) > 0 {
		encoded, err := encodeIDMapping(uidMaps)
		if err != nil {
			return nil, err
		}
		req.AddData(&Bytemsg{Type: UidmapAttr, Value: encoded})
	}

	// gid mappings
	if gidMaps := c.config.GidMappings; len(gidMaps) > 0 {
		encoded, err := encodeIDMapping(gidMaps)
		if err != nil {
			return nil, err
		}
		req.AddData(&Bytemsg{Type: GidmapAttr, Value: encoded})

		// Check whether this process has effective CAP_SETGID; when it does
		// not, the setgroup attribute is added to the message.
		caps, err := capability.NewPid(os.Getpid())
		if err != nil {
			return nil, err
		}
		if !caps.Get(capability.EFFECTIVE, capability.CAP_SETGID) {
			req.AddData(&Boolmsg{Type: SetgroupAttr, Value: true})
		}
	}

	return bytes.NewReader(req.Serialize()), nil
}

 

linuxContainer.newInitProcess(...)最终会使用linuxContainer.bootstrapData封装的clone flags数据,完成initProcess的构建。

 

// newInitProcess builds the initProcess for the container's init process.
//
// It tags cmd's environment with the standard init type, collects the
// namespaces that have an explicit path, and encodes them together with the
// clone flags into the bootstrap data carried by the returned initProcess.
// sharePidns is true when the config gives a path for the PID namespace.
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) {
	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))

	// Namespaces with a path, keyed by type.
	joinPaths := make(map[configs.NamespaceType]string)
	for _, ns := range c.config.Namespaces {
		if ns.Path == "" {
			continue
		}
		joinPaths[ns.Type] = ns.Path
	}
	_, sharePidns := joinPaths[configs.NEWPID]

	bootstrap, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), joinPaths)
	if err != nil {
		return nil, err
	}

	p.consoleChan = make(chan *os.File, 1)

	proc := &initProcess{
		cmd:           cmd,
		childPipe:     childPipe,
		parentPipe:    parentPipe,
		manager:       c.cgroupManager,
		config:        c.newInitConfig(p),
		container:     c,
		process:       p,
		bootstrapData: bootstrap,
		sharePidns:    sharePidns,
		rootDir:       rootDir,
	}
	return proc, nil
}

 

 

func (p *initProcess) start() error {
        defer p.messageSockPair.parent.Close()
      
        err := p.cmd.Start()
        p.process.ops = p
        // close the write-side of the pipes (controlled by child)
        p.messageSockPair.child.Close()
        p.logFilePair.child.Close()
        if err != nil {
                p.process.ops = nil
                return newSystemErrorWithCause(err, "starting init process command")
        }
        // Do this before syncing with child so that no children can escape the
        // cgroup. We don\'t need to worry about not doing this and not being root
        // because we\'d be using the rootless cgroup manager in that case.
        if err := p.manager.Apply(p.pid()); err != nil {
                return newSystemErrorWithCause(err, "applying cgroup configuration for process")
        }
        if p.intelRdtManager != nil {
                if err := p.intelRdtManager.Apply(p.pid()); err != nil {
                        return newSystemErrorWithCause(err, "applying Intel RDT configuration for process")
                }
        }
        defer func() {
                if err != nil {
                        // TODO: should not be the responsibility to call here
                        p.manager.Destroy()
                        if p.intelRdtManager != nil {
                                p.intelRdtManager.Destroy()
                        }
                }
        }()

        if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
                return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
        }
        childPid, err := p.getChildPid()
        if err != nil {
                return newSystemErrorWithCause(err, "getting the final child\'s pid from pipe")
        }
  1. io.Copy 将 p.bootstrapData 中的数据通过 p.messageSockPair.parent 发送给子进程

 

newInitProcess(...)在整个container create的流程中的位置,请参考:runC源码分析之Create/Run Container —— 王涛 如此,namespace在整个container create/run中的源码分析就完整了。

补充:runC中container的Spec是从bundle/config.json中解析得到的,见runC的create.go中的setupSpec(context)的调用。

Action: func(context *cli.Context) error {
       if context.NArg() != 1 {
              fmt.Printf("Incorrect Usage.\n\n")
              cli.ShowCommandHelp(context, "create")
              return fmt.Errorf("runc: \"create\" requires exactly one argument")
       }
       if err := revisePidFile(context); err != nil {
              return err
       }
       spec, err := setupSpec(context)
       if err != nil {
              return err
       }
       status, err := startContainer(context, spec, true)
       if err != nil {
              return err
       }

setupSpec(context)会去loadSpec("config.json"):

// setupSpec performs initial setup based on the cli.Context for the container.
//
// It changes into the directory given by the "bundle" flag (if set), loads the
// spec from specConfig, wires up sd_notify when NOTIFY_SOCKET is present in the
// environment, and finally refuses to continue unless the effective uid is 0.
func setupSpec(context *cli.Context) (*specs.Spec, error) {
	if bundle := context.String("bundle"); bundle != "" {
		if err := os.Chdir(bundle); err != nil {
			return nil, err
		}
	}

	spec, err := loadSpec(specConfig)
	if err != nil {
		return nil, err
	}

	if notifySocket := os.Getenv("NOTIFY_SOCKET"); notifySocket != "" {
		setupSdNotify(spec, notifySocket)
	}

	// The root check runs last, after the spec has been loaded.
	if os.Geteuid() == 0 {
		return spec, nil
	}
	return nil, fmt.Errorf("runc should be run as root")
}

config.json样例如下,namespace部分见 “.linux.namespaces”。

{
    "ociVersion": "0.4.0",
    "platform": {
        "os": "linux",
        "arch": "amd64"
    },
    "process": {
        "terminal": true,
        "user": {},
        "args": [
            "redis-server",
            "--bind",
            "0.0.0.0"
        ],
        "env": [
            "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
            "TERM=xterm"
        ],
        "cwd": "/",
        "capabilities": [
            "CAP_AUDIT_WRITE",
            "CAP_KILL",
            "CAP_NET_BIND_SERVICE"
        ],
        "rlimits": [
            {
                "type": "RLIMIT_NOFILE",
                "hard": 1024,
                "soft": 1024
            }
        ],
        "noNewPrivileges": true
    },
    "root": {
        "path": "rootfs",
        "readonly": true
    },
    "hostname": "runc",
    "mounts": [
        {
            "destination": "/proc",
            "type": "proc",
            "source": "proc"
        },
        {
            "destination": "/dev",
            "type": "tmpfs",
            "source": "tmpfs",
            "options": [
                "nosuid",
                "strictatime",
                "mode=755",
                "size=65536k"
            ]
        },
        {
            "destination": "/dev/pts",
            "type": "devpts",
            "source": "devpts",
            "options": [
                "nosuid",
                "noexec",
                "newinstance",
                "ptmxmode=0666",
                "mode=0620",
                "gid=5"
            ]
        },
        {
            "destination": "/dev/shm",
            "type": "tmpfs",
            "source": "shm",
            "options": [
                "nosuid",
                "noexec",
                "nodev",
                "mode=1777",
                "size=65536k"
            ]
        },
        {
            "destination": "/dev/mqueue",
            "type": "mqueue",
            "source": "mqueue",
            "options": [
                "nosuid",
                "noexec",
                "nodev"
            ]
        },
        {
            "destination": "/sys",
            "type": "sysfs",
            "source": "sysfs",
            "options": [
                "nosuid",
                "noexec",
                "nodev",
                "ro"
            ]
        }

分类:

技术点:

相关文章: