【问题标题】:C how to skip BOM when checking if x is at the start of a file(C:在检查 x 是否位于文件开头时如何跳过 BOM)
【发布时间】:2018-07-25 11:54:20
【问题描述】:

在 C 的数组/字符串中,如果文件带有 BOM,我该如何正确检测文件开头是否是某个内容 x?BOM 有时占用 1 个字符,有时占用 3 个字符,有时根本不存在,因此 x 的实际位置并不总是从索引 0 开始。

大多数时候是这个(十六进制)“ef bb bf” 例如:

ef bb bf 23 21 2f 62 69 6e 2f 62 61 73 68 0a 61 20 26 26 20 62 0a 67 20 : ...#!/bin/bash.a && b.g 

会是这样吗?

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

/*
 * Result of probing a byte string for a byte-order mark (BOM).
 *
 * The trailing ';' is required: without it the declaration runs into the
 * next definition in the file and the translation unit does not compile.
 */
struct BOM {
    int is_BOM;             /* non-zero when a known BOM signature was found */
    int length;             /* number of bytes in the signature, 0 when absent */
    int type;               /* numeric id of the detected encoding, 0 when absent */
    char * type_as_string;  /* human-readable encoding name */
    char * BOM;             /* the signature bytes (or a placeholder text when absent) */
};

/*
 * Report whether string_ begins with the given BOM signature.
 *
 * BOM_     signature literal carrying a one-byte '^' prefix; the comparison
 *          starts at BOM_ + 1.
 * string_  bytes to test; the caller must make sure every byte that gets
 *          compared (up to the first mismatch, at most length_) is readable.
 * length_  number of signature bytes to compare.
 *
 * Returns 1 when all length_ bytes are equal, 0 otherwise. A non-positive
 * length_ yields 0, since an empty signature cannot identify a BOM.
 *
 * The result is kept out of any variable named `matches`: that identifier is
 * the function itself and cannot be assigned to.
 */
int matches(const char *BOM_, const char *string_, int length_) {
    const char *signature = BOM_ + 1; /* skip the leading '^' marker */

    if (length_ <= 0) {
        return 0;
    }
    for (int i = 0; i < length_; i++) {
        if (string_[i] != signature[i]) {
            return 0;
        }
    }
    return 1;
}

/*
 * ifbom: first link of a BOM-detection if/else chain.
 *
 * Expands to an `if` statement that calls matches(BOM_, string_, length_) and,
 * when it is true, fills the five fields of bom_struct. BOM_ is expected to
 * start with a one-byte '^' marker; the stored pointer is BOM_+1 so the marker
 * is not part of the recorded signature.
 *
 * The expansion deliberately ends at the closing brace (no do/while wrapper
 * and no trailing ';') so that an `else` can follow it directly.
 * Note that length_ and BOM_ each appear twice in the expansion, so arguments
 * with side effects would be evaluated more than once.
 */
#define ifbom(bom_struct, is_BOM_, length_, type_, type_as_string_, BOM_, string_) if (matches(BOM_, string_, length_)) { \
    bom_struct.is_BOM = is_BOM_; \
    bom_struct.length = length_; \
    bom_struct.type = type_; \
    bom_struct.type_as_string = type_as_string_; \
    bom_struct.BOM = BOM_+1 /* remove the ^ at the start */ ; \
}

/*
 * elifbom: continuation link of the chain. Expands to `else` followed by the
 * same test-and-fill statement as ifbom, so it must come directly after an
 * ifbom/elifbom expansion.
 */
#define elifbom(bom_struct, is_BOM_, length_, type_, type_as_string_, BOM_, string_) else ifbom(bom_struct, is_BOM_, length_, type_, type_as_string_, BOM_, string_)

/*
 * elbom: final link of the chain. Expands to an unconditional `else` block
 * that fills bom_struct with the given fallback values. Unlike ifbom, BOM_ is
 * stored as-is (no +1), so the fallback text must not carry the '^' marker.
 */
#define elbom(bom_struct, is_BOM_, length_, type_, type_as_string_, BOM_) else { \
    bom_struct.is_BOM = is_BOM_; \
    bom_struct.length = length_; \
    bom_struct.type = type_; \
    bom_struct.type_as_string = type_as_string_; \
    bom_struct.BOM = BOM_; \
}

/*
 * Output modes understood by __hexdump, selected through the global `mode`:
 *   cat  - printable-text column only
 *   hex  - hexadecimal byte column only
 *   both - hexadecimal column, tab padding, ": " separator, then text column
 *   json - handled the same way as cat by __hexdump (text column only)
 */
#define cat 0
#define hex 1
#define both 2
#define json 3

/* Currently selected output mode; file-scope, so it starts out as 0 (cat). */
int mode;

/*
 * Print one dump row for the first `index` bytes of `buffer`, followed by a
 * newline. Which columns appear depends on the global `mode`:
 *   - hex / both:        each byte as two lowercase hex digits plus a space
 *   - both:              one tab per unused slot up to `width`, then ": "
 *   - cat / both / json: each byte as a character, with bytes outside the
 *                        range 32..126 shown as '.'
 */
void __hexdump(unsigned char *buffer, unsigned long index, unsigned long width)
{
    const int show_hex = (mode == hex || mode == both);
    const int show_text = (mode == cat || mode == both || mode == json);

    if (show_hex) {
        for (unsigned long pos = 0; pos < index; pos++) {
            printf("%02x ", buffer[pos]);
        }
    }

    if (mode == both) {
        /* Pad short rows so the text column lines up with full rows. */
        unsigned long pad = index;
        while (pad < width) {
            printf("\t");
            pad++;
        }
        printf(": ");
    }

    if (show_text) {
        for (unsigned long pos = 0; pos < index; pos++) {
            const unsigned char byte = buffer[pos];
            const int printable = (byte >= 32 && byte < 127);

            if (printable) {
                printf("%c", byte);
            } else {
                printf(".");
            }
        }
    }

    printf("\n");
}

/*
 * Dump the bytes of a NUL-terminated string whose offsets fall in the
 * inclusive range [start, stop], emitting rows of at most `width` bytes
 * through __hexdump.
 *
 * infile  NUL-terminated input; scanning ends at the first NUL byte, so data
 *         containing embedded zero bytes is only dumped up to that point.
 * start   offset of the first byte to dump.
 * stop    offset of the last byte to dump (inclusive).
 * width   number of bytes per row; must be greater than zero.
 *
 * Returns 0 on success, -1 when width is zero or the row buffer cannot be
 * allocated (a message is printed in both cases).
 */
int __hexdump_string(char *infile, unsigned long start, unsigned long stop, unsigned long width)
{
    /*
     * A zero width would request a zero-sized buffer and then store the first
     * selected byte into it, writing out of bounds. Reject it up front.
     */
    if (width == 0)
    {
        printf("Row width must be greater than zero\n");
        return -1;
    }

    unsigned long f_index = 0;   /* offset of the byte being examined */
    unsigned long bb_index = 0;  /* number of bytes collected for the current row */
    unsigned char *byte_buffer = malloc(width);
    if (byte_buffer == NULL)
    {
        printf("Could not allocate memory for byte_buffer\n");
        return -1;
    }

    /* Nothing past `stop` is ever selected, so scanning can end there. */
    while (*infile && f_index <= stop)
    {
        if (f_index >= start)
        {
            byte_buffer[bb_index] = (unsigned char)*infile;
            bb_index++;
        }
        if (bb_index >= width)
        {
            /* Row is full: emit it and start a new one. */
            __hexdump(byte_buffer, bb_index, width);
            bb_index = 0;
        }
        f_index++;
        infile++;
    }

    /* Emit the final, partially filled row if there is one. */
    if (bb_index)
        __hexdump(byte_buffer, bb_index, width);

    free(byte_buffer);
    return 0;
}

/*
 * builtin__BOM_print: print every field of a struct BOM, each line prefixed
 * with the argument's source text (via #bom_struct), then hex-dump the BOM
 * bytes in rows of 5.
 *
 * Side effect: sets the global `mode` to `both` and leaves it that way.
 *
 * NOTE(review): bom_struct.length is passed as the `stop` argument of
 * __hexdump_string, which treats stop as inclusive, so up to length + 1 bytes
 * may be dumped (for the "Not present" record with length 0 this prints the
 * first byte of the placeholder text) -- confirm whether that is intended.
 */
#define builtin__BOM_print(bom_struct) { \
    printf("%s.is_BOM = %s\n%s.length = %d\n%s.type = %d\n%s.type_as_string = %s\n%s.BOM = ", #bom_struct, bom_struct.is_BOM?"yes":"no", #bom_struct, bom_struct.length, #bom_struct, bom_struct.type, #bom_struct,bom_struct.type_as_string, #bom_struct); \
    mode = both; \
    __hexdump_string(bom_struct.BOM, 0, bom_struct.length, 5); \
}


/*
 * Identify the byte-order mark, if any, at the start of `string`.
 *
 * string  bytes to inspect. Signatures of up to 4 bytes containing zero bytes
 *         are compared, so the buffer should have at least 4 readable bytes.
 *
 * Returns a struct BOM describing the first signature that matches; when none
 * matches, is_BOM is false, length and type are 0, and both text fields are
 * "Not present".
 *
 * Ordering matters: a signature that is a prefix of a longer one must be
 * tested after it. UTF-32 (LE) is FF FE 00 00 and UTF-16 (LE) is FF FE, so
 * the 32-bit form is checked first; otherwise it could never be reported.
 * The sequence FF FE 00 00 is inherently ambiguous (it may also be a
 * UTF-16 (LE) BOM followed by U+0000); this function reports UTF-32 (LE).
 */
struct BOM builtin__BOM_get(char * string) {
    struct BOM bom;
    ifbom(bom, true, 3, 1, "UTF-8", "^\xef\xbb\xbf", string)
    elifbom(bom, true, 4, 4, "UTF-32 (BE)", "^\x00\x00\xfe\xff", string)
    elifbom(bom, true, 4, 5, "UTF-32 (LE)", "^\xff\xfe\x00\x00", string)
    elifbom(bom, true, 2, 2, "UTF-16 (BE)", "^\xfe\xff", string)
    elifbom(bom, true, 2, 3, "UTF-16 (LE)", "^\xff\xfe", string)
    /* The 5-byte UTF-7 form must precede its 4-byte prefix. */
    elifbom(bom, true, 5, 6, "UTF-7", "^\x2b\x2f\x76\x38\x3d", string)
    elifbom(bom, true, 4, 7, "UTF-7", "^\x2b\x2f\x76\x38", string)
    elifbom(bom, true, 4, 8, "UTF-7", "^\x2b\x2f\x76\x39", string)
    elifbom(bom, true, 4, 9, "UTF-7", "^\x2b\x2f\x76\x2b", string)
    elifbom(bom, true, 4, 10, "UTF-7", "^\x2b\x2f\x76\x2f", string)
    elifbom(bom, true, 3, 11, "UTF-1", "^\xf7\x64\x4c", string)
    elifbom(bom, true, 4, 12, "UTF-EBCDIC", "^\xdd\x73\x66\x73", string)
    elifbom(bom, true, 3, 13, "SCSU", "^\x0e\xfe\xff", string)
    elifbom(bom, true, 3, 14, "BOCU-1", "^\xfb\xee\x28", string)
    elifbom(bom, true, 4, 15, "GB-18030", "^\x84\x31\x95\x33", string)
    elbom(bom, false, 0, 0, "Not present", "Not present")

    /* A cast to a struct type is not valid C; return the object directly. */
    return bom;
}

/*
 * Demo entry point: probes the literal "test" for a BOM and prints the result.
 * The first byte of "test" matches none of the signatures, so the fallback
 * "Not present" record is what gets printed.
 *
 * The local is named `t` on purpose as far as output goes: builtin__BOM_print
 * stringifies its argument, so the name appears in every printed line.
 */
int main()
{
    struct BOM t = builtin__BOM_get("test");
    builtin__BOM_print(t);
    return 0;
}

【问题讨论】:

标签: c byte-order-mark


【解决方案1】:

您应该读取文件开头的几个字符,以判断 BOM 是否存在。

  • 如果前 4 个字符是 FF FE 00 00 : little endian UTF-32
  • 否则,如果前 2 个字符是 FF FE:little endian UTF-16
  • 否则,如果前 4 个字符是 00 00 FE FF:大端 UTF-32
  • 否则,如果前 2 个字符是 FE FF:big endian UTF-16
  • 否则,如果 3 个第一个字符是 EF BB BF : UTF-8
  • 等等……

根据 BOM 长度,您知道实际文件数据从哪个索引开始。

您可以在维基百科页面上找到更完整的 BOM 列表:https://en.wikipedia.org/wiki/Byte_order_mark#Byte_order_marks_by_encoding

【讨论】:

  • "if 4 first chars are FF FE 00 00" 是有问题的。它可以是 BOM-little endian UTF-32 或 BOM-little endian UTF-16,后跟 UTF-16_null_character。后续数据的进一步处理通常可以辨别出哪些。
  • BOM 用于文本文件。你不应该在文本文件中有空字符
  • Unicode 11.0.0 将代码 0 作为字符。 “您不应该在文本文件中包含空字符”是一个普遍的想法,但是您使用什么规范来支持它? C 没有指定。
  • 这不是规范,只是0不是文本字符,当然不能在C规范中,不是语言问题。任何文本编辑器都会将包含 0 字符的文件视为二进制文件
  • 有趣。您使用什么文本编辑器会把含空字符的文件视为二进制而不是文本?我用 MS WordPad 编辑了一个 12 字节的文件“\xFF\xFE\0\0\x41\x0\x42\x0\x43\x0\xA\x0”,它的显示和更新方式就好像文件带有小端序(little endian)UTF-16 BOM(而不是这个答案所说的 UTF-32)以及小端序 UTF-16 文本,并保留了 2 字节的空字符
猜你喜欢
  • 2016-05-12
  • 1970-01-01
  • 1970-01-01
  • 2017-04-24
  • 2015-09-05
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 2013-04-16
相关资源
最近更新 更多