【发布时间】:2021-12-20 12:21:06
【问题描述】:
在生产环境中发现一个不直观的现象:
- 我有一个服务器来接受客户端请求。
- 服务器每次最多读取 16K 的数据(即
read(fd, buf, 16 * 1024))。
我发现read()系统调用通过strace -c返回16KB时延迟为2µs,而read() systemc调用时延迟为5µs返回10B。 (注意:服务器每次 read() 总是读取 16KB,区别在于客户端。一个客户端一次写入 16KB,另一个写入 10B。)
我觉得这很反直觉,read() 或write() 16KB 应该比 10B 慢,但是我对 linux 内核和网络堆栈了解不多。
我写了一个最小的可重现示例:
服务器端代码:
#include <assert.h>
#include <unistd.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
const int BUFSIZE = 16 * 1024;
int main() {
// make a socket
int fd = socket(AF_INET, SOCK_STREAM, 0);
assert(fd > 0);
// bind to 0.0.0.0:7731
struct sockaddr_in sock;
memset(&sock, 0, sizeof(sock));
sock.sin_family = AF_INET;
sock.sin_addr.s_addr = INADDR_ANY;
sock.sin_port = htons(7731);
bind(fd, (struct sockaddr*)&sock, sizeof(sock));
assert(listen(fd, 128) == 0);
// accept a peer
struct sockaddr_in peer_sock;
socklen_t socklen;
int peer_fd = accept(fd, (struct sockaddr*)&peer_sock, &socklen);
assert(peer_fd > 0);
// read & write
char buf[BUFSIZE];
int nread = 0;
while (nread = read(peer_fd, buf, sizeof(buf)), nread > 0) {
int sum = 0;
for (int i = 0; i < nread; ++i) {
sum += buf[i];
}
assert(write(peer_fd, buf, nread) == nread);
}
return 0;
}
客户端代码(由 golang 编写):
package main
import (
"log"
"net"
)
func main() {
conn, err := net.Dial("tcp", "127.0.0.1:7731")
if err != nil {
log.Fatal(err)
}
// 16KB or 10B buf
buf := make([]byte, 16*1024)
for i := 0; i < 1000000; i++ {
nwrite := 0
for nwrite != len(buf) {
n, err := conn.Write(buf)
if err != nil {
log.Fatal(err)
}
nwrite += n
}
nread := 0
readBuf := make([]byte, len(buf))
for nread != len(buf) {
n, err := conn.Read(readBuf)
if err != nil {
log.Fatal(err)
}
nread += n
}
log.Print(i)
}
}
当客户端一次写入10B(注意:服务器每次读取16KB。即read(fd, buf, 16384)),服务器的strace和strace -c -side 输出是:
$ strace ./server
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
$ strace -c ./a.out
% time seconds usecs/call calls errors syscall
------ ----------- ----------- --------- --------- ----------------
65.00 0.233234 9 24596 write
35.00 0.125599 5 24598 read
0.00 0.000000 0 2 open
0.00 0.000000 0 2 close
0.00 0.000000 0 2 fstat
0.00 0.000000 0 5 mmap
0.00 0.000000 0 4 mprotect
0.00 0.000000 0 1 munmap
0.00 0.000000 0 1 brk
0.00 0.000000 0 3 3 access
0.00 0.000000 0 1 socket
0.00 0.000000 0 1 accept
0.00 0.000000 0 1 bind
0.00 0.000000 0 1 listen
0.00 0.000000 0 1 execve
0.00 0.000000 0 1 arch_prctl
------ ----------- ----------- --------- --------- ----------------
100.00 0.358833 49220 3 total
客户端一次写入16KB时,服务器端输出的strace和strace -c为:
$ strace ./server
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
read(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
read(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
read(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
read(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
read(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
read(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
read(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
read(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
read(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
read(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
read(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
read(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
$ strace -c ./server
% time seconds usecs/call calls errors syscall
------ ----------- ----------- --------- --------- ----------------
61.26 0.063949 3 20692 write
38.74 0.040441 2 20694 read
0.00 0.000000 0 2 open
0.00 0.000000 0 2 close
0.00 0.000000 0 2 fstat
0.00 0.000000 0 5 mmap
0.00 0.000000 0 4 mprotect
0.00 0.000000 0 1 munmap
0.00 0.000000 0 1 brk
0.00 0.000000 0 3 3 access
0.00 0.000000 0 1 socket
0.00 0.000000 0 1 accept
0.00 0.000000 0 1 bind
0.00 0.000000 0 1 listen
0.00 0.000000 0 1 execve
0.00 0.000000 0 1 arch_prctl
------ ----------- ----------- --------- --------- ----------------
100.00 0.104390 41412 3 total
【问题讨论】:
-
我猜它正在等待更多数据到达
-
不确定,但如果我不得不猜测,我的猜测是,由于您的缓冲区独立于操作系统使用的数据通信缓冲区,因此缓冲区越大意味着您最终发出的系统调用越少从操作系统的缓冲区中读取。我认为系统调用很昂贵。
-
@AlanBirtles 我认为如果读取 16KB 应该等待更长的时间
-
@Galik 是的,系统调用很昂贵。但我认为 16KB 系统调用会比 10B 系统调用慢,但事实恰恰相反
-
我的第一个猜测是,这是由于 Nagel 的算法等待合并小数据包。分布是怎样的?中位数与平均值不同是因为有几个阻塞读取吗?
标签: c++ c linux linux-kernel network-programming