【问题标题】:Socket can't identify protocol (socket leak)套接字无法识别协议(套接字泄漏)
【发布时间】:2015-12-06 16:26:02
【问题描述】:

我有一个 Go1.5.1 进程/应用程序。当我在该进程上运行/usr/sbin/lsof -p 时,我看到很多“无法识别协议”。

monitor_ 13105 root  101u  sock      0,6      0t0 16960100 can't identify protocol
monitor_ 13105 root  102u  sock      0,6      0t0 21552427 can't identify protocol
monitor_ 13105 root  103u  sock      0,6      0t0 17565091 can't identify protocol
monitor_ 13105 root  104u  sock      0,6      0t0 18476870 can't identify protocol

进程状态/限制/fd

[root@Monitor_q ~]# cat /proc/13105/status 
Name:   monitor_client
State:  S (sleeping)
Tgid:   13105
Pid:    13105
PPid:   13104
TracerPid:  0
Uid:    0   0   0   0
Gid:    0   0   0   0
Utrace: 0
FDSize: 16384
Groups: 
...


[root@Monitor_q ~]# cat /proc/13105/limits 
Limit                     Soft Limit           Hard Limit           Units     
Max cpu time              unlimited            unlimited            seconds   
Max file size             unlimited            unlimited            bytes     
Max data size             unlimited            unlimited            bytes     
Max stack size            10485760             unlimited            bytes     
Max core file size        0                    unlimited            bytes     
Max resident set          unlimited            unlimited            bytes     
Max processes             3870                 3870                 processes 
Max open files            9999                 9999                 files     
Max locked memory         65536                65536                bytes     
Max address space         unlimited            unlimited            bytes     
Max file locks            unlimited            unlimited            locks     
Max pending signals       3870                 3870                 signals   
Max msgqueue size         819200               819200               bytes     
Max nice priority         0                    0                    
Max realtime priority     0                    0                    
Max realtime timeout      unlimited            unlimited            us

[root@Monitor_q ~]# ll /proc/13105/fd/
lrwx------ 1 root root 64 Dec  7 00:15 8382 -> socket:[52023221]
lrwx------ 1 root root 64 Dec  7 00:15 8383 -> socket:[51186627]
lrwx------ 1 root root 64 Dec  7 00:15 8384 -> socket:[51864232]
lrwx------ 1 root root 64 Dec  7 00:15 8385 -> socket:[52435453]
lrwx------ 1 root root 64 Dec  7 00:15 8386 -> socket:[51596071]
lrwx------ 1 root root 64 Dec  7 00:15 8387 -> socket:[52767667]
lrwx------ 1 root root 64 Dec  7 00:15 8388 -> socket:[52090632]
lrwx------ 1 root root 64 Dec  7 00:15 8389 -> socket:[51739068]
lrwx------ 1 root root 64 Dec  7 00:15 839 -> socket:[22963529]
lrwx------ 1 root root 64 Dec  7 00:15 8390 -> socket:[52023223]
lrwx------ 1 root root 64 Dec  7 00:15 8391 -> socket:[52560389]
lrwx------ 1 root root 64 Dec  7 00:15 8392 -> socket:[52402565]
...

netstat -a中没有类似的输出。

这些套接字是什么?我怎样才能知道它们的作用?

monitor_client.go

package main

import (
    "crypto/tls"
    "encoding/json"
    "fmt"
    "log"
    "net"
    "net/http"
    nurl "net/url"
    "strconv"
    "strings"
    "syscall"
    "time"
)

type Result struct {
    Error      string        `json:"error"`
    HttpStatus int           `json:"http_status"`
    Stime      time.Duration `json:"http_time"`
}

//http://stackoverflow.com/questions/20990332/golang-http-timeout-and-goroutines-accumulation
//http://3.3.3.3/http?host=3.2.4.2&servername=a.test&path=/&port=33&timeout=5&scheme=http
func MonitorHttp(w http.ResponseWriter, r *http.Request) {
    var host, servername, path, port, scheme string
    var timeout int
    u, err := nurl.Parse(r.RequestURI)
    if err != nil {
        log.Fatal(err)
        return
    }
    if host = u.Query().Get("host"); host == "" {
        host = "127.0.0.0"
    }
    if servername = u.Query().Get("servername"); servername == "" {
        servername = "localhost"
    }
    if path = u.Query().Get("path"); path == "" {
        path = "/"
    }
    if port = u.Query().Get("port"); port == "" {
        port = "80"
    }
    if scheme = u.Query().Get("scheme"); scheme == "" {
        scheme = "http"
    }

    if timeout, _ = strconv.Atoi(u.Query().Get("timeout")); timeout == 0 {
        timeout = 5
    }

    //log.Printf("(host)=%s (servername)=%s (path)=%s (port)=%s (timeout)=%d", host, servername, path, port, timeout)

    w.Header().Set("Content-Type", "application/json")

    res := httptool(host, port, servername, scheme, path, timeout)
    result, _ := json.Marshal(res)
    fmt.Fprintf(w, "%s", result)
}

func httptool(ip, port, servername, scheme, path string, timeout int) Result {

    var result Result
    startTime := time.Now()
    host := ip + ":" + port

    transport := &http.Transport{
        TLSClientConfig:   &tls.Config{InsecureSkipVerify: true},
        DisableKeepAlives: true,
    }

    dialer := net.Dialer{
        Timeout:   time.Duration(timeout) * time.Second,
        KeepAlive: 0 * time.Second,
    }
    transport.Dial = func(network, address string) (net.Conn, error) {
        return dialer.Dial(network, address)
    }

    client := &http.Client{
        Transport: transport,
    }
    rawquery := ""
    url := fmt.Sprintf("%s://%s%s%s", scheme, host, path, rawquery)
    req, err := http.NewRequest("GET", url, nil)
    if err != nil {
        result.HttpStatus = -1
        errs := strings.Split(err.Error(), ": ")
        result.Error = errs[len(errs)-1]
        result.Stime = time.Now().Sub(startTime) / time.Millisecond
        return result
    }
    req.Header.Set("User-Agent", "monitor worker")
    req.Header.Set("Connection", "close")
    req.Host = servername
    resp, err := client.Do(req)
    //https://github.com/Basiclytics/neverdown/blob/master/check.go
    if err != nil {
        nerr, ok := err.(*nurl.Error)
        if ok {
            switch cerr := nerr.Err.(type) {
            case *net.OpError:
                switch cerr.Err.(type) {
                case *net.DNSError:
                    errs := strings.Split(cerr.Error(), ": ")
                    result.Error = "dns: " + errs[len(errs)-1]
                default:
                    errs := strings.Split(cerr.Error(), ": ")
                    result.Error = "server: " + errs[len(errs)-1]
                }
            default:
                switch nerr.Err.Error() {
                case "net/http: request canceled while waiting for connection":
                    errs := strings.Split(cerr.Error(), ": ")
                    result.Error = "timeout: " + errs[len(errs)-1]

                default:
                    errs := strings.Split(cerr.Error(), ": ")
                    result.Error = "unknown: " + errs[len(errs)-1]
                }
            }

        } else {
            result.Error = "unknown: " + err.Error()
        }
        result.HttpStatus = -2
        result.Stime = time.Now().Sub(startTime) / time.Millisecond
        return result
    }
    resp.Body.Close()
    result.HttpStatus = resp.StatusCode
    result.Error = "noerror"
    result.Stime = time.Now().Sub(startTime) / time.Millisecond //spend time (ms)
    return result
}

func setRlimit() {
    var rLimit syscall.Rlimit
    err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &rLimit)
    if err != nil {
        log.Printf("Unable to obtain rLimit", err)
    }
    if rLimit.Cur < rLimit.Max {
        rLimit.Max = 9999
        rLimit.Cur = 9999
        err = syscall.Setrlimit(syscall.RLIMIT_NOFILE, &rLimit)
        if err != nil {
            log.Printf("Unable to increase number of open files limit", err)
        }
    }
}

func main() {
    setRlimit()
    s := &http.Server{
        Addr:         ":59059",
        ReadTimeout:  7 * time.Second,
        WriteTimeout: 7 * time.Second,
    }
    http.HandleFunc("/http", MonitorHttp)

    log.Fatal(s.ListenAndServe())
}

【问题讨论】:

  • 为什么要为每个呼叫构建整个客户端、拨号器、传输等堆栈?为什么不对所有事情都使用一个客户端呢?客户端做连接池和回收等。
  • 您的请求没有超时,因此任何挂起的请求都会留下一个打开的连接。您还禁用了 TCP keepalive,因此可能永远不会检测到断开的连接。
  • 代码中的其他可能问题:您的错误处理不正确,并且为了稍微更改错误字符串而非常冗长。您正在发送 Connection: close 无缘无故的第二次。您在服务器端有读/写超时,但无法提前退出处理程序(另一个可能阻止套接字关闭的地方)。您不应该为每个调用创建新的传输(或客户端),并使用 DefaultTransport,除非您有理由覆盖它。您在 cmets 中引用的代码示例并不正确,请参阅官方文档。
  • 确实,正如@JimB 所怀疑的那样,'lsof' 为半开套接字打印“无法识别协议”。

标签: sockets go


【解决方案1】:

这里有几点。

无论如何,我无法重现您的行为,can't identify protocol 通常与未正确关闭的套接字相关联。

一些评论者建议您不必在每个处理程序中创建 http 客户端 - 这是真的。只需创建一次即可重复使用。

其次,我不确定您为什么要创建自己的 http.Client 结构以及为什么要禁用 keepalives。你不能和http.Get一起去吗?更简单的代码更容易调试。

第三,不知道为什么要覆盖transport.Dial 函数。即使您必须这样做,文档(针对 Go 1.9.2)也说:

% go doc http.transport.dial
type Transport struct {
    // Dial specifies the dial function for creating unencrypted TCP
    connections.
    //
    // Deprecated: Use DialContext instead, which allows the transport
    // to cancel dials as soon as they are no longer needed.
    // If both are set, DialContext takes priority.
    Dial func(network, addr string) (net.Conn, error)

关于弃用和缺乏拨号重用的评论可能指向您问题的根源。

总而言之,在您的情况下,我会做两件事: * 将客户端创建移至执行一次的代码,或仅使用带有http.Get 的默认客户端 * 我会用覆盖默认传输字段来清理这个东西,如果你必须这样做,那么我会按照建议使用DialContext

祝你好运。

【讨论】:

  • 投反对票的理由是什么?我做了认真的努力来重现和理解问题,然后在此基础上给出反馈。确切地说,downvoter 如何做得更好?
【解决方案2】:

我无法重现该问题。但这是我的 2 美分(没有双关语)

  1. 在一篇文章https://idea.popcount.org/2012-12-09-lsof-cant-identify-protocol/ 中注意到的 SockJS-node 中发现了类似问题,根据 FreeBSD 上观察到的此问题。但问题是“websockets 没有正确清理”
  2. 如果您仍然使用相同的环境,我希望您进行另一项测试。如果可能,发布wireshark日志。只是为了确认网络框架中没有可能导致这种情况的微妙事物。

很抱歉,我无法安装 Go 1.5.1 来重现此问题。 希望这对您有所帮助。

【讨论】:

    猜你喜欢
    • 1970-01-01
    • 1970-01-01
    • 2016-11-28
    • 2014-10-04
    • 2011-01-23
    • 2015-08-26
    • 1970-01-01
    • 1970-01-01
    • 2014-12-28
    相关资源
    最近更新 更多