下面是一段可以运行的代码,它大致相当于您的代码的可运行版本,并在其中添加了日志记录。有了日志,就更容易看出程序是否在合理地工作。
#include "stderr.h"
#include <assert.h>
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>
#define N 5
/*
 * Fork one worker child.
 *
 * Child: picks a random nap (0-2 whole seconds plus a random number of
 * nanoseconds), logs the planned duration, sleeps, and exits.  When
 * rand() % 100 > 90 the exit status is rand() % 255 instead of 0.
 * The child never returns from this function.
 *
 * Parent: returns the child's PID on success, or -1 with errno set when
 * fork() itself failed.
 *
 * Test scaffolding: when rand() % 100 > 90 after a successful fork, the
 * new child is sent a random signal in the range 1..8 and the call
 * reports a fake failure (-1 with errno == EAGAIN).  That child's PID is
 * not returned to the caller, so it is logged here to make it traceable
 * in the log when its status is collected later.
 */
static int newProcess(void)
{
    int pid = fork();
    if (pid == 0)
    {
        // work - this process goes to sleep on the job!
        struct timespec nap = { .tv_sec = rand() % 3, .tv_nsec = rand() % 1000000000 };
        // Announce the work before doing it; %.9ld zero-pads the
        // nanoseconds so that 82518051 ns prints as .082518051.
        err_remark("About to do %ld.%.9ld seconds work\n", (long)nap.tv_sec, (long)nap.tv_nsec);
        nanosleep(&nap, 0);
        int rc = 0;
        if (rand() % 100 > 90)
            rc = rand() % 255;
        err_remark("Work completed - exit status %d\n", rc);
        exit(rc);
    }
    if (pid > 0 && rand() % 100 > 90)
    {
        // Simulated failure: signal the child and pretend fork() failed.
        int signum = rand() % 8 + 1;
        kill(pid, signum);
        err_remark("PID %d started but sent signal %d - reported as failed fork\n", pid, signum);
        errno = EAGAIN; // set last so the logging call cannot disturb it
        pid = -1;
    }
    return pid;
}
/*
 * Probe a process with signal 0 and log the outcome.
 *
 * Returns the result of kill(pid, 0) unchanged: 0 is logged as "OK",
 * anything else as "Dead".
 */
static inline int check_child(int pid)
{
#undef SIGNONE
    enum { SIGNONE = 0 }; // signal number 0: nothing is delivered
    const int probe = kill(pid, SIGNONE);
    const char *verdict = "Dead";
    if (probe == 0)
        verdict = "OK";
    err_remark("PID %d - %s\n", pid, verdict);
    return probe;
}
/*
 * Prune the PID list of entries that check_child() reports as dead.
 *
 * npids - in/out: number of valid entries in pids[]; decremented once
 *         for every entry removed.
 * pids  - the PID list; a dead entry is overwritten with the current
 *         last entry (order is not preserved).
 *
 * The index only advances past an entry that passed the check, so the
 * entry swapped into a vacated slot is itself checked, and no slot at or
 * beyond the shrunken count is ever probed.
 */
static void process_check(int *npids, int pids[])
{
    err_remark("Checking PID list\n");
    int i = 0;
    while (i < *npids)
    {
        if (check_child(pids[i]) == 0)
        {
            i++;
        }
        else
        {
            // Child is presumably dead: drop it by moving the last entry here.
            pids[i] = pids[--*npids];
        }
    }
}
/*
 * Keep up to N worker children running, forever.
 *
 * While fewer than N PIDs are tracked, try to start another child; on
 * failure, log it, prune the PID list, and (for EAGAIN) pause 1-10 ms.
 * Once N PIDs are tracked, block in wait() and remove the reaped PID
 * from the list, or log it as unknown if it is not in the list.
 *
 * Takes no command-line arguments.  The loop has no exit condition.
 */
int main(int argc, char **argv)
{
    err_setarg0(argv[0]);
    if (argc != 1)
        err_usage(" # No arguments allowed");

    int tracked[N] = { 0 };
    srand(time(0));
    err_setlogopts(ERR_PID | ERR_MICRO);

    int live = 0;
    for (;;)
    {
        if (live >= N)
        {
            // List is full: collect one dead child before starting more.
            int status;
            int corpse = wait(&status);
            if (corpse <= 0)
                continue;

            // Linear search for the reaped PID.
            int slot = 0;
            while (slot < live && tracked[slot] != corpse)
                slot++;

            if (slot < live)
            {
                err_remark("PID %d exit status 0x%.4X\n", corpse, status);
                tracked[slot] = tracked[--live]; // swap-remove
            }
            else
            {
                err_remark("Unknown PID %d exit status 0x%.4X - ignored\n", corpse, status);
            }
            continue;
        }

        int pid = newProcess();
        if (pid > 0)
        {
            tracked[live++] = pid;
            err_remark("PID %d started\n", pid);
            continue;
        }

        assert(pid == -1);
        int saved_errno = errno; // capture before any further calls
        err_sysrem("Failed to fork");
        process_check(&live, tracked);
        if (saved_errno == EAGAIN)
        {
            // sleep 1-10 milliseconds (could be too big).
            struct timespec pause = { .tv_sec = 0, .tv_nsec = (rand() % 10 + 1) * 1000000 };
            nanosleep(&pause, 0);
        }
    }
    return 0;
}
头文件 stderr.h 及其配套源文件 stderr.c 可以在 GitHub 上我的 SOQ 仓库的 libsoq 目录中找到。它提供了一个方便且可配置的日志服务。
请注意,测试代码会伪造一些失败,并杀死一些子进程等。在用于生产的代码中,您应当删除这部分。您可能会保留大部分日志记录,特别是当子进程通常一次要工作很多秒,而不是像本例中那样只工作几秒钟的时候。
一些示例输出:
$ ./mon61
mon61: 2017-12-01 09:48:03.636756 - pid=74353: PID 74354 started
mon61: 2017-12-01 09:48:03.637568 - pid=74353: PID 74355 started
mon61: 2017-12-01 09:48:03.637724 - pid=74353: PID 74356 started
mon61: 2017-12-01 09:48:03.637885 - pid=74353: PID 74357 started
mon61: 2017-12-01 09:48:03.638048 - pid=74353: PID 74358 started
mon61: 2017-12-01 09:48:03.747398 - pid=74356: About to do 0.108225168 seconds work
mon61: 2017-12-01 09:48:03.748152 - pid=74356: Work completed - exit status 0
mon61: 2017-12-01 09:48:03.748791 - pid=74353: PID 74356 exit status 0x0000
mon61: 2017-12-01 09:48:03.749046 - pid=74353: PID 74359 started
mon61: 2017-12-01 09:48:04.032219 - pid=74359: About to do 0.281932019 seconds work
mon61: 2017-12-01 09:48:04.032971 - pid=74359: Work completed - exit status 0
mon61: 2017-12-01 09:48:04.033747 - pid=74353: PID 74359 exit status 0x0000
mon61: 2017-12-01 09:48:04.034007 - pid=74353: PID 74361 started
mon61: 2017-12-01 09:48:04.602396 - pid=74355: About to do 0.964067315 seconds work
mon61: 2017-12-01 09:48:04.602951 - pid=74355: Work completed - exit status 0
mon61: 2017-12-01 09:48:04.603596 - pid=74353: PID 74355 exit status 0x0000
mon61: 2017-12-01 09:48:04.603855 - pid=74353: PID 74362 started
mon61: 2017-12-01 09:48:05.419466 - pid=74358: About to do 1.780199743 seconds work
mon61: 2017-12-01 09:48:05.420017 - pid=74358: Work completed - exit status 0
mon61: 2017-12-01 09:48:05.420669 - pid=74353: PID 74358 exit status 0x0000
mon61: 2017-12-01 09:48:05.420923 - pid=74353: PID 74363 started
mon61: 2017-12-01 09:48:05.453929 - pid=74357: About to do 1.814728145 seconds work
mon61: 2017-12-01 09:48:05.454320 - pid=74357: Work completed - exit status 0
mon61: 2017-12-01 09:48:05.454753 - pid=74353: PID 74357 exit status 0x0000
mon61: 2017-12-01 09:48:05.454939 - pid=74353: PID 74364 started
mon61: 2017-12-01 09:48:05.512822 - pid=74354: About to do 1.875699204 seconds work
mon61: 2017-12-01 09:48:05.514094 - pid=74354: Work completed - exit status 0
mon61: 2017-12-01 09:48:05.514349 - pid=74353: PID 74354 exit status 0x0000
mon61: 2017-12-01 09:48:05.514658 - pid=74353: PID 74365 started
mon61: 2017-12-01 09:48:06.004823 - pid=74362: About to do 1.399425773 seconds work
mon61: 2017-12-01 09:48:06.005581 - pid=74362: Work completed - exit status 0
mon61: 2017-12-01 09:48:06.006237 - pid=74353: PID 74362 exit status 0x0000
mon61: 2017-12-01 09:48:06.006523 - pid=74353: Failed to forkerror (35) Resource temporarily unavailable
mon61: 2017-12-01 09:48:06.006562 - pid=74353: Checking PID list
mon61: 2017-12-01 09:48:06.006570 - pid=74353: PID 74364 - OK
mon61: 2017-12-01 09:48:06.006576 - pid=74353: PID 74361 - OK
mon61: 2017-12-01 09:48:06.006582 - pid=74353: PID 74365 - OK
mon61: 2017-12-01 09:48:06.006588 - pid=74353: PID 74363 - OK
mon61: 2017-12-01 09:48:06.013228 - pid=74353: PID 74368 started
mon61: 2017-12-01 09:48:06.013267 - pid=74353: Unknown PID 74366 exit status 0x0006 - ignored
mon61: 2017-12-01 09:48:06.117089 - pid=74361: About to do 2. 82518051 seconds work
mon61: 2017-12-01 09:48:06.117618 - pid=74361: Work completed - exit status 0
mon61: 2017-12-01 09:48:06.118206 - pid=74353: PID 74361 exit status 0x0000
mon61: 2017-12-01 09:48:06.118486 - pid=74353: PID 74369 started
mon61: 2017-12-01 09:48:06.537455 - pid=74363: About to do 1.115086289 seconds work
mon61: 2017-12-01 09:48:06.537967 - pid=74363: Work completed - exit status 0
mon61: 2017-12-01 09:48:06.538610 - pid=74353: PID 74363 exit status 0x0000
mon61: 2017-12-01 09:48:06.538880 - pid=74353: PID 74371 started
mon61: 2017-12-01 09:48:06.682182 - pid=74371: About to do 0.141922802 seconds work
mon61: 2017-12-01 09:48:06.682945 - pid=74371: Work completed - exit status 0
mon61: 2017-12-01 09:48:06.683733 - pid=74353: PID 74371 exit status 0x0000
mon61: 2017-12-01 09:48:06.684007 - pid=74353: PID 74372 started
mon61: 2017-12-01 09:48:06.975561 - pid=74364: About to do 1.519976923 seconds work
mon61: 2017-12-01 09:48:06.976341 - pid=74364: Work completed - exit status 188
mon61: 2017-12-01 09:48:06.976942 - pid=74353: PID 74364 exit status 0xBC00
mon61: 2017-12-01 09:48:06.977225 - pid=74353: PID 74373 started
mon61: 2017-12-01 09:48:07.436814 - pid=74368: About to do 1.422967208 seconds work
mon61: 2017-12-01 09:48:07.437600 - pid=74368: Work completed - exit status 0
mon61: 2017-12-01 09:48:07.438230 - pid=74353: PID 74368 exit status 0x0000
查看日志可以发现,其中有一些关于“未知 PID”进程死亡的消息。这表明在管理 PID 数组方面(也就是“修复错误”)还有一些工作要做。我以后可能会研究一下。
查看代码可知,这些消息是“预期的”。大约有 9% 的几率,子进程被创建后随即被信号杀死(所有这类子进程的退出状态都是 0x0001 到 0x0008 之间的值,表示被信号杀死)。对于这些进程,newProcess() 的返回值为 -1,这使得它们的 PID 不会进入已知子进程列表,因此当子进程确实死亡、其状态信息被收集时,该 PID 就是“未知”的。换句话说,这是“预期的”行为。可以用更好的方式记录此类进程:把 PID 取负值后返回给调用者,并写一条消息,说明该子进程已创建但被信号杀死(可能在子进程有机会做任何事情之前,例如在它报告自己正在运行之前)。
输出 "About to do N.xxxxxxxxx seconds work" 的 err_remark() 调用既放错了位置,格式也有错误。它应该放在 nanosleep() 调用之前,而不是之后。它还应该使用 %.9ld 而不是 %9ld 来格式化秒的小数部分(纳秒)。这两处都很容易修复。
除了让子进程做真正的工作而不是只在那里睡眠之外,还有各种改进可以做。该代码可以处理一些信号(例如,收到中断信号时检查子进程,收到挂断信号时重新读取配置文件,收到终止信号时杀死子进程并退出)。它可以写入日志文件而不是标准错误。它可以作为守护进程运行,而不是在前台运行。它可以提供选项来控制日志文件目录,也许还有日志文件名。它可以检测其日志文件是否(以及何时)被删除,并开始写一个新文件。等等。
但这给了你一些可以玩的东西。