C在检查x是否在文件开头时如何跳过BOM答案

【问题标题】：C how to skip BOM when checking if x is at the start of a file（C：在检查 x 是否位于文件开头时如何跳过 BOM）
【发布时间】：2018-07-25 11:54:20
【问题描述】：

在 C 的数组/字符串中，如果文件带有 BOM，我该如何正确检测文件开头是否存在某个内容 x？BOM 有时占用 1 个字符，有时占用 3 个字符，有时又根本不存在，因此 x 的实际位置并不总是从索引 0 开始。

大多数时候是这个（十六进制）“ef bb bf” 例如：

ef bb bf 23 21 2f 62 69 6e 2f 62 61 73 68 0a 61 20 26 26 20 62 0a 67 20 : ...#!/bin/bash.a && b.g

会是这样吗？

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

/*
 * Result of probing the start of a buffer for a byte order mark (BOM).
 * Every field is filled in by builtin__BOM_get through the ifbom / elifbom /
 * elbom macros.
 */
struct BOM {
    int is_BOM;             /* non-zero when a BOM was recognised, 0 otherwise */
    int length;             /* number of bytes in the recognised BOM; 0 when none */
    int type;               /* numeric id of the detected encoding; 0 when none */
    char * type_as_string;  /* human-readable encoding name */
    char * BOM;             /* the BOM bytes, or a placeholder string when none */
};

/*
 * Report whether the first length_ bytes of string_ equal a BOM pattern.
 *
 * BOM_    : pattern whose first character is a '^' marker that is skipped;
 *           the bytes actually compared start at BOM_ + 1.
 * string_ : buffer to test. Bytes are read one at a time and the comparison
 *           stops at the first mismatch, so up to length_ bytes may be read.
 * length_ : number of bytes to compare.
 *
 * Returns 1 when all length_ bytes are equal, 0 otherwise. A NULL argument or
 * a non-positive length_ is reported as "no match".
 */
int matches(const char * BOM_, const char * string_, int length_) {
    if (BOM_ == NULL || string_ == NULL || length_ <= 0) {
        return 0;
    }

    const char * b = BOM_ + 1; /* skip the leading '^' marker */
    for (int i = 0; i < length_; i++) {
        if (string_[i] != b[i]) {
            return 0;
        }
    }
    return 1;
}

/*
 * ifbom: first link of a BOM-detection chain.
 *
 * Expands to `if (matches(BOM_, string_, length_)) { ... }` with no trailing
 * else, so that elifbom / elbom can follow it directly. When the test
 * succeeds, every field of bom_struct is assigned from the macro arguments;
 * the stored BOM pointer is BOM_ + 1, i.e. the pattern without its leading
 * '^' marker.
 *
 * All arguments are parenthesised so that expressions such as `*ptr` can be
 * passed. BOM_ is evaluated twice (once for the test, once for the store).
 */
#define ifbom(bom_struct, is_BOM_, length_, type_, type_as_string_, BOM_, string_) \
    if (matches((BOM_), (string_), (length_))) { \
        (bom_struct).is_BOM = (is_BOM_); \
        (bom_struct).length = (length_); \
        (bom_struct).type = (type_); \
        (bom_struct).type_as_string = (type_as_string_); \
        (bom_struct).BOM = (BOM_) + 1 /* remove the ^ at the start */ ; \
    }

/*
 * elifbom: middle link of a BOM-detection chain. Expands to `else` followed by
 * ifbom with the same arguments, so it must come directly after an ifbom or
 * another elifbom.
 */
#define elifbom(bom_struct, is_BOM_, length_, type_, type_as_string_, BOM_, string_) else ifbom(bom_struct, is_BOM_, length_, type_, type_as_string_, BOM_, string_)

/*
 * elbom: final link of a BOM-detection chain.
 *
 * Expands to an `else { ... }` block that unconditionally assigns every field
 * of bom_struct from the macro arguments. Unlike ifbom, BOM_ is stored as-is
 * (no leading '^' marker is skipped).
 *
 * All arguments are parenthesised so that expressions such as `*ptr` can be
 * passed for bom_struct.
 */
#define elbom(bom_struct, is_BOM_, length_, type_, type_as_string_, BOM_) \
    else { \
        (bom_struct).is_BOM = (is_BOM_); \
        (bom_struct).length = (length_); \
        (bom_struct).type = (type_); \
        (bom_struct).type_as_string = (type_as_string_); \
        (bom_struct).BOM = (BOM_); \
    }

/*
 * Output modes read by __hexdump. These are enum constants rather than
 * object-like macros so that the short lowercase names obey normal scoping
 * and cannot silently rewrite unrelated identifiers later in the file.
 */
enum {
    cat = 0,   /* text column only */
    hex = 1,   /* hex bytes only */
    both = 2,  /* hex bytes, tab padding, ": ", then the text column */
    json = 3   /* rendered by __hexdump the same way as cat */
};

/*
 * Currently selected output mode. As a file-scope object it starts at 0
 * (cat); builtin__BOM_print sets it to `both` before dumping.
 */
int mode;

/*
 * Print one row of a dump to stdout, laid out according to the global `mode`.
 *
 * buffer : bytes to print.
 * index  : number of bytes of buffer to print.
 * width  : nominal row width in bytes; only used in `both` mode, to pad a
 *          short row.
 *
 * hex / both        : each byte as two hex digits followed by a space.
 * both              : one tab per missing byte (from index up to width),
 *                     then ": ".
 * cat / both / json : each byte as a character; bytes below 32 or at/above
 *                     127 are shown as '.'.
 * A newline always terminates the row.
 */
void __hexdump(unsigned char *buffer, unsigned long index, unsigned long width)
{
    const int show_hex = (mode == hex || mode == both);
    const int show_pad = (mode == both);
    const int show_text = (mode == cat || mode == both || mode == json);

    if (show_hex) {
        for (unsigned long k = 0; k < index; k++) {
            printf("%02x ", buffer[k]);
        }
    }

    if (show_pad) {
        unsigned long column = index;
        while (column < width) {
            printf("\t");
            column++;
        }
        printf(": ");
    }

    if (show_text) {
        for (unsigned long k = 0; k < index; k++) {
            const unsigned char byte = buffer[k];
            const int printable = (byte >= 32 && byte < 127);

            if (printable) {
                printf("%c", byte);
            } else {
                printf(".");
            }
        }
    }

    printf("\n");
}

/*
 * Dump a slice of a NUL-terminated string through __hexdump, `width` bytes
 * per row.
 *
 * infile : NUL-terminated input. The scan ends at the first 0x00 byte, so
 *          data containing embedded zero bytes is truncated there.
 * start  : index of the first byte to dump.
 * stop   : index of the last byte to dump (inclusive).
 * width  : number of bytes per output row; must be non-zero.
 *
 * Returns 0 on success, -1 when infile is NULL, width is 0, or the row buffer
 * cannot be allocated.
 */
int __hexdump_string(char *infile, unsigned long start, unsigned long stop, unsigned long width)
{
    /* A zero-width row would store a byte into a zero-size allocation. */
    if (infile == NULL || width == 0)
        return -1;

    unsigned char *byte_buffer = malloc(width);
    if (byte_buffer == NULL)
    {
        printf("Could not allocate memory for byte_buffer\n");
        return -1;
    }

    unsigned long bb_index = 0;
    /* Nothing past `stop` is ever printed, so the scan ends there. */
    for (unsigned long f_index = 0; infile[f_index] != '\0' && f_index <= stop; f_index++)
    {
        if (f_index < start)
            continue;

        byte_buffer[bb_index] = (unsigned char)infile[f_index];
        bb_index++;

        /* Row is full: emit it and start a new one. */
        if (bb_index >= width)
        {
            __hexdump(byte_buffer, bb_index, width);
            bb_index = 0;
        }
    }

    /* Emit the final, partially filled row. */
    if (bb_index)
        __hexdump(byte_buffer, bb_index, width);

    free(byte_buffer);
    return 0;
}

/*
 * builtin__BOM_print: print every field of a struct BOM to stdout, then dump
 * its BOM bytes via __hexdump_string with a row width of 5.
 *
 * The argument is stringified with #, so each printed line is prefixed with
 * the argument's source text (e.g. "t.length = ..."). It is also evaluated
 * several times, so pass a plain lvalue without side effects.
 *
 * Side effect: the global `mode` is set to `both` and left that way.
 *
 * Wrapped in do { } while (0) so the macro acts as one statement and is safe
 * in an unbraced if/else; call it with a trailing semicolon.
 *
 * NOTE(review): __hexdump_string compares `f_index <= stop`, so passing
 * `length` as stop asks for length + 1 bytes. With length 0 this dumps the
 * first byte of the placeholder string — confirm whether that is intended.
 */
#define builtin__BOM_print(bom_struct) do { \
    printf("%s.is_BOM = %s\n%s.length = %d\n%s.type = %d\n%s.type_as_string = %s\n%s.BOM = ", #bom_struct, (bom_struct).is_BOM?"yes":"no", #bom_struct, (bom_struct).length, #bom_struct, (bom_struct).type, #bom_struct, (bom_struct).type_as_string, #bom_struct); \
    mode = both; \
    __hexdump_string((bom_struct).BOM, 0, (bom_struct).length, 5); \
} while (0)


struct BOM builtin__BOM_get(char * string) {
    struct BOM bom;
    ifbom(bom, true, 3, 1, "UTF-8", "^\xef\xbb\xbf", string)
    elifbom(bom, true, 2, 2, "UTF-16 (BE)", "^\xfe\xff", string)
    elifbom(bom, true, 2, 3, "UTF-16 (LE)", "^\xff\xfe", string)
    elifbom(bom, true, 4, 4, "UTF-32 (BE)", "^\x00\x00\xfe\xff", string)
    elifbom(bom, true, 4, 5, "UTF-32 (LE)", "^\xff\xfe\x00\x00", string)
    elifbom(bom, true, 5, 6, "UTF-7", "^\x2b\x2f\x76\x38\x3d", string)
    elifbom(bom, true, 4, 7, "UTF-7", "^\x2b\x2f\x76\x38", string)
    elifbom(bom, true, 4, 8, "UTF-7", "^\x2b\x2f\x76\x39", string)
    elifbom(bom, true, 4, 9, "UTF-7", "^\x2b\x2f\x76\x2b", string)
    elifbom(bom, true, 4, 10, "UTF-7", "^\x2b\x2f\x76\x2f", string)
    elifbom(bom, true, 3, 11, "UTF-1", "^\xf7\x64\x4c", string)
    elifbom(bom, true, 4, 12, "UTF-EBCDIC", "^\xdd\x73\x66\x73", string)
    elifbom(bom, true, 3, 13, "SCSU", "^\x0e\xfe\xff", string)
    elifbom(bom, true, 3, 14, "BOCU-1", "^\xfb\xee\x28", string)
    elifbom(bom, true, 4, 15, "GB-18030", "^\x84\x31\x95\x33", string)
    elbom(bom, false, 0, 0, "Not present", "Not present")

    return (struct BOM) bom;
}

/*
 * Demo entry point: probe the literal "test" for a BOM and print the result.
 * The local must stay named `t`, because builtin__BOM_print stringifies its
 * argument and uses that text as the prefix of every printed line.
 */
int main()
{
    struct BOM t;

    t = builtin__BOM_get("test");
    builtin__BOM_print(t);

    return 0;
}

【问题讨论】：

文件内容总是从字节 0 开始。你需要知道这些内容代表什么。
There aren't that many cases to test（需要测试的情况并不多），所以……把它们逐一测试一遍即可。

标签： c byte-order-mark

【解决方案1】：

您应该读取文件开头的几个字符，以判断 BOM 是否存在。

如果前 4 个字符是 FF FE 00 00：小端（little endian）UTF-32
否则，如果前 2 个字符是 FF FE：小端（little endian）UTF-16
否则，如果前 4 个字符是 00 00 FE FF：大端（big endian）UTF-32
否则，如果前 2 个字符是 FE FF：大端（big endian）UTF-16
否则，如果前 3 个字符是 EF BB BF：UTF-8
等等……

根据 BOM 长度，您知道实际文件数据从哪个索引开始。

您可以在维基百科页面上找到更完整的 BOM 列表：https://en.wikipedia.org/wiki/Byte_order_mark#Byte_order_marks_by_encoding

【讨论】：

"if 4 first chars are FF FE 00 00" 是有问题的。它可以是 BOM-little endian UTF-32 或 BOM-little endian UTF-16，后跟 UTF-16_null_character。后续数据的进一步处理通常可以辨别出哪些。
BOM 用于文本文件。你不应该在文本文件中有空字符
Unicode 11.0.0 将代码 0 作为字符。 “您不应该在文本文件中包含空字符”是一个普遍的想法，但是您使用什么规范来支持它？ C 没有指定。
这不是规范，只是0不是文本字符，当然不能在C规范中，不是语言问题。任何文本编辑器都会将包含 0 字符的文件视为二进制文件
有趣。您使用的是哪个文本编辑器，会把含空字符的文件当作二进制而不是文本？我用 MS WordPad 编辑了一个 12 字节的文件 “\xFF\xFE\0\0\x41\x0\x42\x0\x43\x0\xA\x0”，它的显示和保存方式表明：它把该文件当作带有小端 UTF-16 BOM（而不是这个答案所说的 UTF-32）的小端 UTF-16 文本，并保留了那个 2 字节的空字符。