1. 背景
网站上爬取了部分关于影视的百度知道QA,为了后续提高影视的搜索效果,需要基于百度知道QA的答案抽取相关的影视信息。
2. 流程
目前已有基础的媒资视频库信息,基于媒资视频库中的视频名称构建分词字典,结合使用AC自动机双数组Trie(Aho-Corasick Double-Array Trie),然后针对百度知道的QA文本进行分词。针对分词后的结果,可以结合视频热度与评分进行筛选。
3. 代码实现
(1) 基于文本(格式为每行一条视频名称),结合AC自动机双数组Trie,构建分词器
package com.test.model.act;

import com.google.common.collect.Lists;
import com.test.util.IOUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static com.test.model.act.AhoCorasickDoubleArrayTrie.*;

import java.io.*;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.TreeMap;

/**
 * Longest-match word segmenter backed by an Aho-Corasick double-array trie
 * built from a video-name dictionary ("videoNames.txt", one name per line).
 * Used to extract known video titles from Baidu-Zhidao QA answer text.
 *
 * <p>The built trie is serialized to the file {@code act} and reloaded on
 * subsequent startups to skip the (expensive) build step.
 *
 * @author test
 * @date 2018/11/1
 */
public class Act {

    private static final Logger logger = LoggerFactory.getLogger(Act.class);

    /** Lazily-created shared instance; guarded by the class lock in getInstance(). */
    private static Act instance = null;

    /** File path used to persist / restore the serialized trie. */
    private static final String path = "act";

    private final AhoCorasickDoubleArrayTrie<Resource> act = new AhoCorasickDoubleArrayTrie<>();

    /**
     * Returns the shared instance, building (or loading) the trie on first call.
     *
     * <p>Fix: synchronized — the original unsynchronized lazy init could let two
     * threads each build a trie and leak one of the instances.
     *
     * @throws IOException            if the trie file or dictionary cannot be read/written
     * @throws ClassNotFoundException if the serialized trie references a missing class
     */
    public static synchronized Act getInstance() throws IOException, ClassNotFoundException {
        if (instance == null) {
            instance = new Act();
        }
        return instance;
    }

    public Act() throws IOException, ClassNotFoundException {
        this.initTrie();
    }

    /**
     * Initializes the AC automaton: loads the serialized trie from {@code path}
     * if that file exists, otherwise builds it from "videoNames.txt" and
     * persists the result to {@code path}.
     *
     * <p>Fix: all streams are closed via try-with-resources — the original
     * never closed the ObjectInputStream/ObjectOutputStream, leaking file
     * handles and risking an unflushed, truncated trie file on save.
     *
     * @throws IOException            on read/write failure
     * @throws ClassNotFoundException if the serialized trie cannot be deserialized
     */
    private void initTrie() throws IOException, ClassNotFoundException {
        File cache = new File(path);
        if (cache.exists()) {
            long start = System.currentTimeMillis();
            try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(cache))) {
                act.load(ois);
            }
            logger.info("load act cost: {} ms", System.currentTimeMillis() - start);
        } else {
            TreeMap<String, Resource> treeMap = new TreeMap<>();
            List<String> datas = IOUtil.getPreprocessedData("videoNames.txt");
            for (String data : datas) {
                data = data.trim();
                // Keep the first Resource created for each distinct name.
                if (!treeMap.containsKey(data)) {
                    treeMap.put(data, new Resource(data));
                }
            }

            long start = System.currentTimeMillis();
            act.build(treeMap);
            logger.info("build act cost: {} ms", System.currentTimeMillis() - start);

            start = System.currentTimeMillis();
            try (ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(cache))) {
                act.save(oos);
            }
            logger.info("save act cost: {} ms", System.currentTimeMillis() - start);
        }
    }

    /**
     * Longest-match segmentation of {@code queryText} using the AC automaton.
     *
     * <p>For each hit: any previously-kept term that overlaps this hit
     * ({@code current.end >= begin}) and is strictly shorter is removed; the
     * new hit is then added unless its dictionary value is a substring of some
     * already-kept term's value.
     *
     * <p>Logic is intentionally preserved statement-for-statement from the
     * original; only formatting and comments were changed in this method.
     *
     * @param queryText text to segment
     * @return the kept terms, in hit order
     */
    public List<Term<Resource>> parse(String queryText) {
        final List<Term<Resource>> terms = Lists.newArrayList();
        act.parseText(queryText, new AhoCorasickDoubleArrayTrie.IHit<Resource>() {
            @Override
            public void hit(int begin, int end, Resource value) {
                Iterator<Term<Resource>> iterator = terms.iterator();
                int length = end - begin;
                boolean isSubStr = false;
                while (iterator.hasNext()) {
                    Term<Resource> current = iterator.next();
                    // Overlaps the new hit and is shorter: drop it (longest match wins).
                    if (current.end >= begin && length > current.getLength()) {
                        iterator.remove();
                    }
                    // NOTE(review): containment is checked on the dictionary
                    // string, not on positions — a hit whose value appears
                    // inside a kept value is suppressed even if disjoint in the
                    // text. Preserved as-is; confirm this is intended.
                    if (current.getValue().getValue().contains(value.getValue())) {
                        isSubStr = true;
                    }
                }
                if (!isSubStr) {
                    terms.add(new Term<Resource>(begin, end, value));
                }
            }
        });
        return terms;
    }

    /**
     * Deduplicates a segmentation result: walking from the last term backwards,
     * a term whose text is a suffix of the preceding term's text is dropped;
     * the surviving values are returned in original (text) order.
     *
     * <p>Fix: the original threw {@link IndexOutOfBoundsException} on an empty
     * term list ({@code terms.get(0)}); an empty input now yields an empty list.
     *
     * @param terms segmentation result from {@link #parse(String)}
     * @return deduplicated term values in original order; empty for empty input
     */
    public List<String> neatSplitResult(List<Term<Resource>> terms) {
        List<String> results = Lists.newArrayList();
        if (terms.isEmpty()) {
            return results;
        }
        List<String> dupResults = Lists.newArrayList();
        for (int j = terms.size() - 1; j > 0; j--) {
            String termJ = terms.get(j).getValue().getValue();
            // Skip a term that duplicates the tail of its predecessor.
            if (!terms.get(j - 1).getValue().getValue().endsWith(termJ)) {
                dupResults.add(termJ);
            }
        }
        dupResults.add(terms.get(0).getValue().getValue());
        // dupResults was collected back-to-front; restore text order.
        Collections.reverse(dupResults);
        results.addAll(dupResults);
        return results;
    }
}