原文出处:http://www.yund.tech/zdetail.html?type=1&id=ee06002e2b83e7677c30aedc52d3429e
作者:jstarseven
现在的网站千奇百怪,什么样的格式都有。需要提取网页中的列表数据时,逐个页面分析处理很让人头疼。本文介绍一个页面结构分析程序,可以分析出页面大致的列表结构。
废话不多说,我也不会说,show me code,code is terrible,so what hahaha。-------jstarseven
1.抽取元素dom结构框架
1 /** 2 * 分析元素dom结构框架 3 * 4 * @param node 5 * @return 6 */ 7 public String filterHtml(Element node) { 8 //去除节点的属性值 9 Document new_node = Jsoup.parse(node.outerHtml()); 10 Elements elements = new_node.getAllElements(); 11 for (Element item : elements) { 12 Attributes attributes = item.attributes(); 13 for (Attribute a : attributes) { 14 if (a.getKey().equals(KeysEnum.attr_scroce)) { 15 item.removeAttr(a.getKey()); 16 continue; 17 } 18 a.setValue(StringUtils.EMPTY); 19 } 20 } 21 //去除注释节点,节点文本内容 22 String str_new = new_node.outerHtml().replaceAll("<!--?(.*?)-->", ""); 23 str_new = str_new.replaceAll("\\s*", ""); 24 str_new = str_new.replaceAll(">(.*?)<", "><"); 25 return str_new; 26 }
2.采用动态规划计算两个字符串的编辑距离,用于衡量相似度
/**
 * String-similarity helper built on the Levenshtein edit distance.
 */
public class SimilarDegree {

    /** Constant exposed to callers; it is not read anywhere inside this class. */
    public static final double degree = 0.8;

    /**
     * Computes the edit distance between two strings with dynamic programming:
     * the minimum number of single-character insertions, deletions and
     * replacements needed to turn {@code source} into {@code target}.
     *
     * <p>Only two rows of the DP table are kept, so memory use grows with the
     * length of {@code target} instead of with the product of both lengths.
     *
     * @param source the string to transform
     * @param target the string to transform into
     * @return the edit distance; {@code 0} when both strings are equal
     * @throws NullPointerException if either argument is {@code null}
     */
    public static int EditDistance(String source, String target) {
        char[] sources = source.toCharArray();
        char[] targets = target.toCharArray();
        int sourceLen = sources.length;
        int targetLen = targets.length;

        // previous[j] holds the distance between the first (i - 1) source
        // chars and the first j target chars; current[j] is the row being built.
        int[] previous = new int[targetLen + 1];
        int[] current = new int[targetLen + 1];

        // Row 0: turning the empty prefix into j target chars takes j inserts.
        for (int j = 0; j <= targetLen; j++) {
            previous[j] = j;
        }

        for (int i = 1; i <= sourceLen; i++) {
            // Column 0: turning i source chars into the empty prefix takes i deletes.
            current[0] = i;
            for (int j = 1; j <= targetLen; j++) {
                if (sources[i - 1] == targets[j - 1]) {
                    // Matching characters cost nothing.
                    current[j] = previous[j - 1];
                } else {
                    int insert = current[j - 1] + 1;
                    int delete = previous[j] + 1;
                    int replace = previous[j - 1] + 1;
                    current[j] = Math.min(replace, Math.min(insert, delete));
                }
            }
            // The finished row becomes the "previous" row of the next iteration.
            int[] swap = previous;
            previous = current;
            current = swap;
        }
        // After the final swap the last computed row lives in 'previous'.
        return previous[targetLen];
    }

    public static void main(String[] args) {
        System.out.println(EditDistance("html > body > ul > li.proiect_item:nth-child(1) > div.item_row.item_row_title > div:nth-child(1) > a",
                "html > body > ul > li.proiect_item:nth-child(2) > div.item_row.item_row_title > div:nth-child(1) > a"));
    }

}