【发布时间】:2015-12-06 16:26:02
【问题描述】:
我有一个 Go1.5.1 进程/应用程序。当我在该进程上运行/usr/sbin/lsof -p 时,我看到很多“无法识别协议”。
monitor_ 13105 root 101u sock 0,6 0t0 16960100 can't identify protocol
monitor_ 13105 root 102u sock 0,6 0t0 21552427 can't identify protocol
monitor_ 13105 root 103u sock 0,6 0t0 17565091 can't identify protocol
monitor_ 13105 root 104u sock 0,6 0t0 18476870 can't identify protocol
进程状态/限制/fd
[root@Monitor_q ~]# cat /proc/13105/status
Name: monitor_client
State: S (sleeping)
Tgid: 13105
Pid: 13105
PPid: 13104
TracerPid: 0
Uid: 0 0 0 0
Gid: 0 0 0 0
Utrace: 0
FDSize: 16384
Groups:
...
[root@Monitor_q ~]# cat /proc/13105/limits
Limit Soft Limit Hard Limit Units
Max cpu time unlimited unlimited seconds
Max file size unlimited unlimited bytes
Max data size unlimited unlimited bytes
Max stack size 10485760 unlimited bytes
Max core file size 0 unlimited bytes
Max resident set unlimited unlimited bytes
Max processes 3870 3870 processes
Max open files 9999 9999 files
Max locked memory 65536 65536 bytes
Max address space unlimited unlimited bytes
Max file locks unlimited unlimited locks
Max pending signals 3870 3870 signals
Max msgqueue size 819200 819200 bytes
Max nice priority 0 0
Max realtime priority 0 0
Max realtime timeout unlimited unlimited us
[root@Monitor_q ~]# ll /proc/13105/fd/
lrwx------ 1 root root 64 Dec 7 00:15 8382 -> socket:[52023221]
lrwx------ 1 root root 64 Dec 7 00:15 8383 -> socket:[51186627]
lrwx------ 1 root root 64 Dec 7 00:15 8384 -> socket:[51864232]
lrwx------ 1 root root 64 Dec 7 00:15 8385 -> socket:[52435453]
lrwx------ 1 root root 64 Dec 7 00:15 8386 -> socket:[51596071]
lrwx------ 1 root root 64 Dec 7 00:15 8387 -> socket:[52767667]
lrwx------ 1 root root 64 Dec 7 00:15 8388 -> socket:[52090632]
lrwx------ 1 root root 64 Dec 7 00:15 8389 -> socket:[51739068]
lrwx------ 1 root root 64 Dec 7 00:15 839 -> socket:[22963529]
lrwx------ 1 root root 64 Dec 7 00:15 8390 -> socket:[52023223]
lrwx------ 1 root root 64 Dec 7 00:15 8391 -> socket:[52560389]
lrwx------ 1 root root 64 Dec 7 00:15 8392 -> socket:[52402565]
...
但netstat -a中没有类似的输出。
这些套接字是什么?我怎样才能知道它们的作用?
monitor_client.go
package main
import (
"crypto/tls"
"encoding/json"
"fmt"
"log"
"net"
"net/http"
nurl "net/url"
"strconv"
"strings"
"syscall"
"time"
)
type Result struct {
Error string `json:"error"`
HttpStatus int `json:"http_status"`
Stime time.Duration `json:"http_time"`
}
//http://stackoverflow.com/questions/20990332/golang-http-timeout-and-goroutines-accumulation
//http://3.3.3.3/http?host=3.2.4.2&servername=a.test&path=/&port=33&timeout=5&scheme=http
func MonitorHttp(w http.ResponseWriter, r *http.Request) {
var host, servername, path, port, scheme string
var timeout int
u, err := nurl.Parse(r.RequestURI)
if err != nil {
log.Fatal(err)
return
}
if host = u.Query().Get("host"); host == "" {
host = "127.0.0.0"
}
if servername = u.Query().Get("servername"); servername == "" {
servername = "localhost"
}
if path = u.Query().Get("path"); path == "" {
path = "/"
}
if port = u.Query().Get("port"); port == "" {
port = "80"
}
if scheme = u.Query().Get("scheme"); scheme == "" {
scheme = "http"
}
if timeout, _ = strconv.Atoi(u.Query().Get("timeout")); timeout == 0 {
timeout = 5
}
//log.Printf("(host)=%s (servername)=%s (path)=%s (port)=%s (timeout)=%d", host, servername, path, port, timeout)
w.Header().Set("Content-Type", "application/json")
res := httptool(host, port, servername, scheme, path, timeout)
result, _ := json.Marshal(res)
fmt.Fprintf(w, "%s", result)
}
func httptool(ip, port, servername, scheme, path string, timeout int) Result {
var result Result
startTime := time.Now()
host := ip + ":" + port
transport := &http.Transport{
TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
DisableKeepAlives: true,
}
dialer := net.Dialer{
Timeout: time.Duration(timeout) * time.Second,
KeepAlive: 0 * time.Second,
}
transport.Dial = func(network, address string) (net.Conn, error) {
return dialer.Dial(network, address)
}
client := &http.Client{
Transport: transport,
}
rawquery := ""
url := fmt.Sprintf("%s://%s%s%s", scheme, host, path, rawquery)
req, err := http.NewRequest("GET", url, nil)
if err != nil {
result.HttpStatus = -1
errs := strings.Split(err.Error(), ": ")
result.Error = errs[len(errs)-1]
result.Stime = time.Now().Sub(startTime) / time.Millisecond
return result
}
req.Header.Set("User-Agent", "monitor worker")
req.Header.Set("Connection", "close")
req.Host = servername
resp, err := client.Do(req)
//https://github.com/Basiclytics/neverdown/blob/master/check.go
if err != nil {
nerr, ok := err.(*nurl.Error)
if ok {
switch cerr := nerr.Err.(type) {
case *net.OpError:
switch cerr.Err.(type) {
case *net.DNSError:
errs := strings.Split(cerr.Error(), ": ")
result.Error = "dns: " + errs[len(errs)-1]
default:
errs := strings.Split(cerr.Error(), ": ")
result.Error = "server: " + errs[len(errs)-1]
}
default:
switch nerr.Err.Error() {
case "net/http: request canceled while waiting for connection":
errs := strings.Split(cerr.Error(), ": ")
result.Error = "timeout: " + errs[len(errs)-1]
default:
errs := strings.Split(cerr.Error(), ": ")
result.Error = "unknown: " + errs[len(errs)-1]
}
}
} else {
result.Error = "unknown: " + err.Error()
}
result.HttpStatus = -2
result.Stime = time.Now().Sub(startTime) / time.Millisecond
return result
}
resp.Body.Close()
result.HttpStatus = resp.StatusCode
result.Error = "noerror"
result.Stime = time.Now().Sub(startTime) / time.Millisecond //spend time (ms)
return result
}
func setRlimit() {
var rLimit syscall.Rlimit
err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &rLimit)
if err != nil {
log.Printf("Unable to obtain rLimit", err)
}
if rLimit.Cur < rLimit.Max {
rLimit.Max = 9999
rLimit.Cur = 9999
err = syscall.Setrlimit(syscall.RLIMIT_NOFILE, &rLimit)
if err != nil {
log.Printf("Unable to increase number of open files limit", err)
}
}
}
func main() {
setRlimit()
s := &http.Server{
Addr: ":59059",
ReadTimeout: 7 * time.Second,
WriteTimeout: 7 * time.Second,
}
http.HandleFunc("/http", MonitorHttp)
log.Fatal(s.ListenAndServe())
}
【问题讨论】:
-
为什么要为每个呼叫构建整个客户端、拨号器、传输等堆栈?为什么不对所有事情都使用一个客户端呢?客户端做连接池和回收等。
-
您的请求没有超时,因此任何挂起的请求都会留下一个打开的连接。您还禁用了 TCP keepalive,因此可能永远不会检测到断开的连接。
-
代码中的其他可能问题:您的错误处理不正确,并且为了稍微更改错误字符串而非常冗长。您正在发送 Connection: close 无缘无故的第二次。您在服务器端有读/写超时,但无法提前退出处理程序(另一个可能阻止套接字关闭的地方)。您不应该为每个调用创建新的传输(或客户端),并使用 DefaultTransport,除非您有理由覆盖它。您在 cmets 中引用的代码示例并不正确,请参阅官方文档。
-
确实,正如@JimB 所怀疑的那样,'lsof' 为半开套接字打印“无法识别协议”。