0基础入门预训练的词向量

在我参考的代码中，有的是直接加载预训练词向量；但更常见的做法是先对数据进行预处理、构建自己的词汇表，然后利用已有的预训练词向量，按词汇表的顺序对其中的词进行编码：

1.glove 模型：

def process_questions(args):
    """Encode questions and answers of a QA dataset and pickle the result.

    The annotation file is a JSON list of instances, each providing the keys
    ``'question'``, ``'answer'`` and ``'video_id'``.

    In ``'train'`` mode the question/answer vocabularies are built from the
    data and written to ``args.vocab_json``; in every other mode the
    vocabulary is loaded from that file, so evaluation splits reuse the
    training vocabulary.

    Args:
        args: Namespace-like object providing ``annotation_file``, ``mode``,
            ``answer_top``, ``vocab_json``, ``dataset``, ``glove_pt`` and
            ``output_pt``. ``vocab_json`` and ``output_pt`` are format
            strings filled with ``dataset`` (and ``mode`` for the output).

    Raises:
        ValueError: If an answer is missing from the vocabulary and
            ``args.mode`` is none of ``'train'``, ``'val'``, ``'test'``.
    """
    print('Loading data')

    # Load the dataset annotations.
    with open(args.annotation_file, 'r') as dataset_file:
        instances = json.load(dataset_file)

    # Either create the vocab or load it from disk.
    # Only the training split builds a vocabulary; the other splits reuse it.
    if args.mode in ['train']:
        print('Building vocab')
        # Answer frequency statistics.
        answer_counter = Counter(instance['answer'] for instance in instances)

        answer_token_to_idx = {'<UNK0>': 0, '<UNK1>': 1}
        # Keep only the most frequent answers; rare answers are dropped from
        # the vocabulary.
        frequent_answers = answer_counter.most_common(args.answer_top)
        total_ans = sum(answer_counter.values())
        total_freq_ans = sum(cnt for _, cnt in frequent_answers)
        print("Number of unique answers:", len(answer_counter))
        print("Total number of answers:", total_ans)
        print("Top %i answers account for %f%%" % (len(frequent_answers), total_freq_ans * 100.0 / total_ans))

        # Build the answer vocabulary (indices follow frequency order).
        for token, _ in frequent_answers:
            answer_token_to_idx[token] = len(answer_token_to_idx)
        print('Get answer_token_to_idx, num: %d' % len(answer_token_to_idx))

        # Build the question vocabulary; questions are tokenized on the way.
        question_token_to_idx = {'<NULL>': 0, '<UNK>': 1}
        for instance in instances:
            # The last character is dropped before tokenizing
            # (presumably the trailing question mark -- TODO confirm).
            question = instance['question'].lower()[:-1]
            for token in nltk.word_tokenize(question):
                if token not in question_token_to_idx:
                    question_token_to_idx[token] = len(question_token_to_idx)
        print('Get question_token_to_idx')
        print(len(question_token_to_idx))

        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
            'question_answer_token_to_idx': {'<NULL>': 0, '<UNK>': 1}
        }
        # Persist the vocabulary so that the other splits can load it.
        vocab_path = args.vocab_json.format(args.dataset, args.dataset)
        print('Write into %s' % vocab_path)
        with open(vocab_path, 'w') as f:
            json.dump(vocab, f, indent=4)
    else:
        print('Loading vocab')
        vocab_path = args.vocab_json.format(args.dataset, args.dataset)
        with open(vocab_path, 'r') as f:
            vocab = json.load(f)

    # Encode all questions
    print('Encoding data')
    questions_encoded = []
    questions_len = []
    question_ids = []
    video_ids_tbw = []
    video_names_tbw = []
    all_answers = []
    for idx, instance in enumerate(instances):
        question = instance['question'].lower()[:-1]
        # Tokenize, then map the tokens to vocabulary indices.
        question_tokens = nltk.word_tokenize(question)
        question_encoded = utils.encode(question_tokens, vocab['question_token_to_idx'], allow_unk=True)
        questions_encoded.append(question_encoded)
        questions_len.append(len(question_encoded))
        question_ids.append(idx)
        im_name = instance['video_id']
        video_ids_tbw.append(im_name)
        video_names_tbw.append(im_name)

        if instance['answer'] in vocab['answer_token_to_idx']:
            answer = vocab['answer_token_to_idx'][instance['answer']]
        elif args.mode in ['train']:
            answer = 0
        elif args.mode in ['val', 'test']:
            answer = 1
        else:
            # Without this branch `answer` would be unbound on the first
            # instance, or silently reuse the previous instance's value.
            raise ValueError(
                'Cannot encode out-of-vocabulary answer for mode %r' % (args.mode,))

        all_answers.append(answer)

    # Pad every encoded question with <NULL> up to the longest question so
    # that they can be stacked into one rectangular array.
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        missing = max_question_length - len(qe)
        if missing > 0:
            qe.extend([vocab['question_token_to_idx']['<NULL>']] * missing)

    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    questions_len = np.asarray(questions_len, dtype=np.int32)
    print(questions_encoded.shape)

    glove_matrix = None
    if args.mode == 'train':
        # Build an embedding matrix whose rows follow the order of our own
        # question vocabulary.
        token_itow = {i: w for w, i in vocab['question_token_to_idx'].items()}
        print("Load glove from %s" % args.glove_pt)
        # NOTE(review): unpickling can execute arbitrary code; only load a
        # GloVe pickle that comes from a trusted source.
        with open(args.glove_pt, 'rb') as glove_file:
            glove = pickle.load(glove_file)
        dim_word = glove['the'].shape[0]
        # Tokens missing from GloVe are mapped to the zero vector.
        zero_vector = np.zeros((dim_word,))
        glove_matrix = [glove.get(token_itow[i], zero_vector) for i in range(len(token_itow))]
        glove_matrix = np.asarray(glove_matrix, dtype=np.float32)
        print(glove_matrix.shape)

    output_path = args.output_pt.format(args.dataset, args.dataset, args.mode)
    print('Writing', output_path)
    obj = {
        'questions': questions_encoded,
        'questions_len': questions_len,
        'question_id': question_ids,
        'video_ids': np.asarray(video_ids_tbw),
        'video_names': np.array(video_names_tbw),
        'answers': all_answers,
        'glove': glove_matrix,
    }
    with open(output_path, 'wb') as f:
        pickle.dump(obj, f)

2. Elmo 模型

# Build the ELMo embedder from the downloaded options and weight files.
# NOTE(review): presumably AllenNLP's `Elmo` module, in which case the third
# positional argument (1) is the number of output representations and
# `vocab_to_cache` pre-computes embeddings for the given vocabulary -- confirm
# against the actual import.
self.elmo = Elmo(args.options_file, args.weight_file, 1, dropout=0.5,vocab_to_cache=vocab)

下载后得到的是 weight 和 options 两个文件，这一点和 GloVe 不一样：它没有现成的 300d 词向量表，只有用来构建模型的参数（权重和配置），词向量需要把文本输入模型后计算得到。

参考链接：可以去看看，挺好的

https://zhuanlan.zhihu.com/p/55320266

处理oov词：摘自

Word Embedding 如何处理未登录词？

作者：刘晗
链接：https://www.zhihu.com/question/308543084/answer/611451582
来源：知乎
著作权归作者所有。商业转载请联系作者获得授权，非商业转载请注明出处。

大概有这么几种方法吧：

最原始的做法是用 UNK 标签表示所有未登录词，但是 UNK 的 embedding 一般不会用零向量。

第二种方法

我觉得最容易想到的方法，使用 sub-word level embedding。比如大名鼎鼎的 fastText，通过 character n-gram 组合出 word embedding，不存在 OOV 的问题。官网能找到有很多语种的 pre-trained embedding，自己训练起来也不难，网上也有很多 blog、tutorial 可参考。而且感觉常用度不比 word2vec、GloVe 差。

论文：Enriching Word Vectors with Subword Information, Bojanowski et al.
代码：facebookresearch/fastText

第三种方法（重点！）

也是我看到题目时脑海中涌现的第一个想法，来源于论文 "Mimicking Word Embeddings using Subword RNNs." by Yuval Pinter, Robert Guthrie, and Jacob Eisenstein. 本文发表在 EMNLP 2017 上，专门解决 word embedding 的 OOV 问题。论文提出了 MIMICK 模型，在已有 embedding 上根据 subword 信息学习一个从拼写到 embedding 的 function 来为 OOV words 生成 embedding。作者还在 Universal Dependencies (UD) 语料库的 23 种语言上测试了 MIMICK 在词性标注任务 (POS tagging) 中的准确率，并得到了不错的结果。而且 MIMICK 在中文上有着突出的表现。原因是 UD 的中文语料库包含了 >12000 的 token，可供 MIMICK 充分地学习出 embedding 的方法。另外 MIMICK 在低资源语言 (low-resource languages) 处理上有着重要的作用。通常情况下，用来训练的语料库资源有限，而 MIMICK 可以发挥自身优势，充分利用现有数据生成 word embedding ，完成下游任务。

论文：Mimicking Word Embeddings using Subword RNNs
代码：yuvalpinter/Mimick