php获取页面并切割页面div内容

亮点：

1、利用php也能实现对页面div的切割处理。这里的做法抛砖引玉，希望读者能够提供更加完美的解决方案。

2、切割处理方法已经封装成一个方法，可以直接引用。

3、顺便加上博客园标签云的截取，调用示例：getWebDiv('id="taglist"', 'http://www.cnblogs.com/Zjmainstay/tag/');

View Code

<?php
    // Declare the response as UTF-8 HTML; getWebDiv() converts recognised GB2312/GBK pages to UTF-8.
    header("Content-type: text/html; charset=utf-8"); 
    /**
     * Extract the complete <div>...</div> element whose opening tag precedes a marker string.
     *
     * The marker (typically an attribute such as id="taglist") is located with a plain
     * substring search; the element returned is the one opened by the last "<div" that
     * starts at or before the marker, including any nested <div> elements.
     *
     * @param string       $div_id Marker text to look for, e.g. 'id="taglist"'.
     * @param string|false $url    Page URL to download; pass false to use $data instead.
     * @param string|false $data   HTML source to search when $url is false.
     *
     * @return string|false The element's markup terminated by a literal "</div>", or false
     *                      when the page cannot be read, the marker is absent, no <div>
     *                      opens before it, or the element is never closed.
     */
    function getWebDiv($div_id, $url = false, $data = false) {
        if ($url !== false) {
            $data = file_get_contents($url);
        }
        // file_get_contents() yields false on failure; an empty needle cannot be searched for.
        if (!is_string($data) || $data === '' || !is_string($div_id) || $div_id === '') {
            return false;
        }

        // Use the first declared charset (quoted or not) and convert GB2312/GBK pages to UTF-8.
        // Requiring the "charset=" prefix avoids false positives from a bare "utf-8" in the body.
        if (preg_match('/charset\s*=\s*["\']?\s*(utf-8|gb2312|gbk)\b/i', $data, $charset_match)) {
            $charset = strtolower($charset_match[1]);
            if ($charset !== 'utf-8') {
                $converted = iconv($charset, 'utf-8', $data);
                // Keep the raw bytes if the conversion fails instead of losing the whole page.
                if ($converted !== false) {
                    $data = $converted;
                }
            }
        }

        $hit = strpos($data, $div_id);
        if ($hit === false) {
            return false;    // marker not found
        }

        // Collect opening and closing div tags in document order; group 1 is "/" for closers.
        // The lookahead keeps longer tag names that merely start with "div" from matching.
        preg_match_all('/<(\/?)div(?![\w:-])/i', $data, $tags, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
        if (empty($tags)) {
            return false;
        }

        // The target is the last opening tag that starts at or before the marker.
        $start_index = -1;
        foreach ($tags as $index => $tag_match) {
            if ($tag_match[0][1] > $hit) {
                break;
            }
            if ($tag_match[1][0] === '') {
                $start_index = $index;
            }
        }
        if ($start_index < 0) {
            return false;
        }

        // Walk the following tags: an opener goes one level deeper, a closer comes back up;
        // the closer seen at depth 0 is the one that terminates the target element.
        $start  = $tags[$start_index][0][1];
        $deeper = 0;
        $total  = count($tags);
        for ($i = $start_index + 1; $i < $total; $i++) {
            if ($tags[$i][1][0] === '') {
                $deeper++;
                continue;
            }
            if ($deeper === 0) {
                return substr($data, $start, $tags[$i][0][1] - $start) . '</div>';
            }
            $deeper--;
        }

        return false;    // the target div is never closed
    }
    
    // Demo: print the tag-cloud container (id="taglist") of the cnblogs tag page.
    echo getWebDiv('id="taglist"','http://www.cnblogs.com/Zjmainstay/tag/');

//End_php

考虑到 id 属性的引号写法不固定（单引号、双引号或不带引号），匹配串（如 id="u"）需由用户按目标页面中的实际写法自行填写。

声明：此段php只针对带 id div内容的读取。

——————————————————————————完善：匹配任意可闭合带id标签————————————————————————————————————————————

View Code

 1 <?php
 2     header("Content-type: text/html; charset=utf-8"); 
 3     function getWebTag($tag_id,$url=false,$tag=\'div\',$data=false){
 4         if($url !== false){
 5             $data = file_get_contents( $url );
 6         }
 7         $charset_pos = stripos($data,\'charset\');
 8         if($charset_pos) {
 9             if(stripos($data,\'charset=utf-8\',$charset_pos)) {
10                 $data = iconv(\'utf-8\',\'utf-8\',$data);
11             }else if(stripos($data,\'charset=gb2312\',$charset_pos)) {
12                 $data = iconv(\'gb2312\',\'utf-8\',$data);
13             }else if(stripos($data,\'charset=gbk\',$charset_pos)) {
14                 $data = iconv(\'gbk\',\'utf-8\',$data);
15             }
16         }
17         
18         preg_match_all(\'/<\'.$tag.\'/i\',$data,$pre_matches,PREG_OFFSET_CAPTURE);    //获取所有div前缀
19         preg_match_all(\'/<\/\'.$tag.\'/i\',$data,$suf_matches,PREG_OFFSET_CAPTURE); //获取所有div后缀
20         $hit = strpos($data,$tag_id);
21         if($hit == -1) return false;    //未命中
22         $divs = array();    //合并所有div
23         foreach($pre_matches[0] as $index=>$pre_div){
24             $divs[(int)$pre_div[1]] = \'p\';
25             $divs[(int)$suf_matches[0][$index][1]] = \'s\';    
26         }
27         
28         //对div进行排序
29         $sort = array_keys($divs);
30         asort($sort);
31         
32         $count = count($pre_matches[0]);
33         foreach($pre_matches[0] as $index=>$pre_div){
34             //<div $hit <div+1    时div被命中
35             if(($pre_matches[0][$index][1] < $hit) && ($hit < $pre_matches[0][$index+1][1])){
36                 $deeper = 0;
37                 //弹出被命中div前的div
38                 while(array_shift($sort) != $pre_matches[0][$index][1] && ($count--)) continue;
39                 //对剩余div进行匹配，若下一个为前缀，则向下一层，$deeper加1，
40                 //否则后退一层，$deeper减1，$deeper为0则命中匹配，计算div长度
41                 foreach($sort as $key){
42                     if($divs[$key] == \'p\') $deeper++;
43                     else if($deeper == 0) {
44                         $length = $key-$pre_matches[0][$index][1];
45                         break;
46                     }else {
47                         $deeper--;
48                     }
49                 }
50                 $hitDivString = substr($data,$pre_matches[0][$index][1],$length).\'</\'.$tag.\'>\';
51                 break;
52             }
53         }
54         return $hitDivString;
55     }
56     
57     echo getWebTag(\'id="nav"\',\'http://mail.163.com/html/mail_intro/\',\'ul\');
58     echo getWebTag(\'id="homeBanners"\',\'http://mail.163.com/html/mail_intro/\');
59     echo getWebTag(\'id="performance"\',\'http://mail.163.com/html/mail_intro/\',\'section\');
60 
61 //End_php

修复：在 stripos($data,'charset=utf-8',$charset_pos) 的查找串中加入“charset=”前缀，避免某些 gb2312 编码的网页内容中出现“utf-8”字样而被误判为 UTF-8。用户也可以自行修改函数，传入一个确定的 charset 参数。

演示地址：parseDiv