1. 背景
网站上爬取了部分关于影视的百度知道QA,为了后续提高影视的搜索效果,需要基于百度知道QA的答案抽取相关的影视信息。
2. 流程
目前已有基础的媒资视频库信息,基于媒资视频库中的视频名称构建分词字典,结合使用AC自动机双数组Trie(Aho-Corasick Double-Array Trie),然后针对百度知道的QA文本进行分词。针对分词后的结果,可以结合视频热度与评分进行筛选。
3. 代码实现
(1) 基于文本(格式为每行一条视频名称),结合AC自动机双数组Trie,构建分词器
package com.test.model.act;

import com.google.common.collect.Lists;
import com.test.util.IOUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static com.test.model.act.AhoCorasickDoubleArrayTrie.*;

import java.io.*;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.TreeMap;

/**
 * Longest-match word segmenter backed by an Aho-Corasick double-array trie
 * built from a video-name dictionary ("videoNames.txt", one name per line).
 * Used to extract known video titles from Baidu-Zhidao QA answer text.
 *
 * <p>The built trie is serialized to the file {@code act} and reloaded on
 * subsequent startups to skip the (expensive) build step.
 *
 * @author test
 * @date 2018/11/1
 */
public class Act {

    private static final Logger logger = LoggerFactory.getLogger(Act.class);

    /** Lazily-created shared instance; guarded by the class lock in getInstance(). */
    private static Act instance = null;

    /** File path used to persist / restore the serialized trie. */
    private static final String path = "act";

    private final AhoCorasickDoubleArrayTrie<Resource> act = new AhoCorasickDoubleArrayTrie<>();

    /**
     * Returns the shared instance, building (or loading) the trie on first call.
     *
     * <p>Fix: synchronized — the original unsynchronized lazy init could let two
     * threads each build a trie and leak one of the instances.
     *
     * @throws IOException            if the trie file or dictionary cannot be read/written
     * @throws ClassNotFoundException if the serialized trie references a missing class
     */
    public static synchronized Act getInstance() throws IOException, ClassNotFoundException {
        if (instance == null) {
            instance = new Act();
        }
        return instance;
    }

    public Act() throws IOException, ClassNotFoundException {
        this.initTrie();
    }

    /**
     * Initializes the AC automaton: loads the serialized trie from {@code path}
     * if that file exists, otherwise builds it from "videoNames.txt" and
     * persists the result to {@code path}.
     *
     * <p>Fix: all streams are closed via try-with-resources — the original
     * never closed the ObjectInputStream/ObjectOutputStream, leaking file
     * handles and risking an unflushed, truncated trie file on save.
     *
     * @throws IOException            on read/write failure
     * @throws ClassNotFoundException if the serialized trie cannot be deserialized
     */
    private void initTrie() throws IOException, ClassNotFoundException {
        File cache = new File(path);
        if (cache.exists()) {
            long start = System.currentTimeMillis();
            try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(cache))) {
                act.load(ois);
            }
            logger.info("load act cost: {} ms", System.currentTimeMillis() - start);
        } else {
            TreeMap<String, Resource> treeMap = new TreeMap<>();
            List<String> datas = IOUtil.getPreprocessedData("videoNames.txt");
            for (String data : datas) {
                data = data.trim();
                // Keep the first Resource created for each distinct name.
                if (!treeMap.containsKey(data)) {
                    treeMap.put(data, new Resource(data));
                }
            }

            long start = System.currentTimeMillis();
            act.build(treeMap);
            logger.info("build act cost: {} ms", System.currentTimeMillis() - start);

            start = System.currentTimeMillis();
            try (ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(cache))) {
                act.save(oos);
            }
            logger.info("save act cost: {} ms", System.currentTimeMillis() - start);
        }
    }

    /**
     * Longest-match segmentation of {@code queryText} using the AC automaton.
     *
     * <p>For each hit: any previously-kept term that overlaps this hit
     * ({@code current.end >= begin}) and is strictly shorter is removed; the
     * new hit is then added unless its dictionary value is a substring of some
     * already-kept term's value.
     *
     * <p>Logic is intentionally preserved statement-for-statement from the
     * original; only formatting and comments were changed in this method.
     *
     * @param queryText text to segment
     * @return the kept terms, in hit order
     */
    public List<Term<Resource>> parse(String queryText) {
        final List<Term<Resource>> terms = Lists.newArrayList();
        act.parseText(queryText, new AhoCorasickDoubleArrayTrie.IHit<Resource>() {
            @Override
            public void hit(int begin, int end, Resource value) {
                Iterator<Term<Resource>> iterator = terms.iterator();
                int length = end - begin;
                boolean isSubStr = false;
                while (iterator.hasNext()) {
                    Term<Resource> current = iterator.next();
                    // Overlaps the new hit and is shorter: drop it (longest match wins).
                    if (current.end >= begin && length > current.getLength()) {
                        iterator.remove();
                    }
                    // NOTE(review): containment is checked on the dictionary
                    // string, not on positions — a hit whose value appears
                    // inside a kept value is suppressed even if disjoint in the
                    // text. Preserved as-is; confirm this is intended.
                    if (current.getValue().getValue().contains(value.getValue())) {
                        isSubStr = true;
                    }
                }
                if (!isSubStr) {
                    terms.add(new Term<Resource>(begin, end, value));
                }
            }
        });
        return terms;
    }

    /**
     * Deduplicates a segmentation result: walking from the last term backwards,
     * a term whose text is a suffix of the preceding term's text is dropped;
     * the surviving values are returned in original (text) order.
     *
     * <p>Fix: the original threw {@link IndexOutOfBoundsException} on an empty
     * term list ({@code terms.get(0)}); an empty input now yields an empty list.
     *
     * @param terms segmentation result from {@link #parse(String)}
     * @return deduplicated term values in original order; empty for empty input
     */
    public List<String> neatSplitResult(List<Term<Resource>> terms) {
        List<String> results = Lists.newArrayList();
        if (terms.isEmpty()) {
            return results;
        }
        List<String> dupResults = Lists.newArrayList();
        for (int j = terms.size() - 1; j > 0; j--) {
            String termJ = terms.get(j).getValue().getValue();
            // Skip a term that duplicates the tail of its predecessor.
            if (!terms.get(j - 1).getValue().getValue().endsWith(termJ)) {
                dupResults.add(termJ);
            }
        }
        dupResults.add(terms.get(0).getValue().getValue());
        // dupResults was collected back-to-front; restore text order.
        Collections.reverse(dupResults);
        results.addAll(dupResults);
        return results;
    }
}