【问题标题】:Convert HTML DOM into a multidimensional array将 HTML DOM 转换为多维数组
【发布时间】:2010-04-09 14:51:52
【问题描述】:

我想把 HTML DOM 转换为多维数组:其中键为元素类型,值以 #foo.bar 的形式表示(以空格分隔,便于直接使用 explode())。这是否可行,或者是否已有现成的实现?

我知道这个问题可能会激起一些愤怒,我希望没有人链接到那个关于解析 HTML 的帖子,但我希望这不是不可能的。感谢您的帮助。

附录:理想情况下,会使用 PHP,因为它是我所知道的唯一脚本语言。

【问题讨论】:

  • 你在说什么编程语言?

标签: php html dom multidimensional-array


【解决方案1】:

感谢大家的帮助。下面这个函数会将 HTML 正文转换为包含属性、类和 id 的多维数组。

<?php

/**
 * Converts an HTML string into a nested array of its elements.
 *
 * Script blocks, style blocks and comments are removed first. The remaining
 * tags are extracted in document order, passed through checkTags() and then
 * nested by htmlToArray(), starting at the first tag.
 *
 * @param string $raw_html HTML markup to convert.
 * @return array Nested array produced by htmlToArray(); an empty array when
 *               the markup contains no tags at all.
 */
function htmlArrayer($raw_html){

    $match_open_or_closed = '/(\<(\/?[^\>]+)\>)/';
    $match_scripts = '@<script[^>]*?>.*?</script>@si';
    // No "U" modifier here: PCRE_UNGREEDY inverts greediness, so combined with
    // ".*?" it made the match greedy and deleted everything between the first
    // <style> and the last </style>, including real markup in between.
    $match_styles = '@<style[^>]*?>.*?</style>@si';
    $match_comments = '/<!--.*?-->/si';

    $raw_html = preg_replace($match_scripts, '', $raw_html);
    $raw_html = preg_replace($match_styles, '', $raw_html);
    $raw_html = preg_replace($match_comments, '', $raw_html);

    // Pad the angle brackets with spaces and flatten all whitespace to single
    // spaces, so tag contents can later be split with explode(' ', ...).
    $raw_html = str_replace('>', '> ', $raw_html);
    $raw_html = str_replace('<', ' <', $raw_html);
    $raw_html = str_replace('!--', '!-- ', $raw_html);
    $raw_html = preg_replace('/[ \t\r\n]/', ' ', $raw_html);

    // Capture group 2 holds the text between "<" and ">" of every tag.
    preg_match_all($match_open_or_closed, $raw_html, $matches);

    // Nothing to nest: return early instead of sending htmlToArray() onto an
    // empty list, where it would never find a closing tag.
    if(empty($matches[2])){
        return array();
    }

    $matches[2] = checkTags($matches[2]);
    $html_array = htmlToArray($matches[2], 0);

    return $html_array;

}

/**
 * Replaces every tag that is not on the whitelist with the placeholder 'br'.
 *
 * Each entry is the text found between "<" and ">" of a tag. Its first
 * space-separated token (after trimming) is compared against the whitelist,
 * either as an opening tag ("div") or as a closing tag ("/div"). Entries that
 * pass are returned untouched; all others become the string 'br'. Keys are
 * preserved.
 *
 * @param array $htmlArray Tag strings, e.g. 'div class="a"' or '/div'.
 * @return array The same list with unrecognised tags replaced by 'br'.
 */
function checkTags($htmlArray) {
    $valid_tags_array = array(
        'html', 'body', 'div', 'span', 'applet', 'object', 'iframe',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'blockquote', 'pre',
        'a', 'abbr', 'acronym', 'address', 'big', 'cite', 'code',
        'del', 'dfn', 'em', 'font', 'img', 'ins', 'kbd', 'q', 's', 'samp',
        'small', 'strike', 'strong', 'sub', 'sup', 'tt', 'var',
        'b', 'u', 'i', 'center',
        'dl', 'dt', 'dd', 'ol', 'ul', 'li',
        'fieldset', 'form', 'label', 'legend',
        'table', 'caption', 'tbody', 'tfoot', 'thead', 'tr', 'th', 'td'
    );

    $checked = $htmlArray;

    foreach ($htmlArray as $position => $raw_tag) {
        $tokens = explode(' ', trim($raw_tag));
        $tag_name = $tokens[0];

        // Accepted as an opening tag when the name itself is whitelisted.
        $is_known_open = in_array($tag_name, $valid_tags_array, true);

        // Accepted as a closing tag when the name is "/" plus a whitelisted tag.
        $is_known_close = strncmp($tag_name, '/', 1) === 0
            && in_array(substr($tag_name, 1), $valid_tags_array, true);

        if (!$is_known_open && !$is_known_close) {
            $checked[$position] = 'br';
        }
    }

    return $checked;
}

/**
 * Recursively builds the nested array for the element found at $index.
 *
 * The result's first entry is the element's own label: its tag name, a space,
 * and whatever attrToCSS() returns for the tag. Every following entry is one
 * direct child: a one-entry array for a void element, or the recursive result
 * for any other element. Scanning stops at the element's matching closing tag
 * or at the end of the list, whichever comes first.
 *
 * @param array $untiered_array Flat list of tag strings in document order,
 *                              e.g. 'div class="a"', 'br', '/div'.
 * @param int   $index          Position of the element to build. Leading 'br'
 *                              placeholders at this position are skipped.
 * @return array Nested array for the element, or an empty array when no
 *               element exists at or after $index.
 */
function htmlToArray($untiered_array, $index){
    // Elements that never have a closing tag; they become leaf entries.
    // ('basefont' was previously misspelled as 'basefront'.)
    $one_way_elements = array('br', 'img', 'area', 'base', 'basefont', 'hr', 'input', 'link', 'meta', 'col', 'embed', 'param');

    // Skip every leading 'br' placeholder, not just the first: a placeholder
    // can never be the root because no closing tag would ever match it.
    while(isset($untiered_array[$index])){
        $untiered_element = explode(' ', $untiered_array[$index]);
        if($untiered_element[0] != 'br'){
            break;
        }
        $index++;
    }

    // Ran off the end of the list: there is no element to build.
    if(!isset($untiered_array[$index])){
        return array();
    }

    $css_string = attrToCSS($untiered_array[$index]);
    $untiered_array[$index] = $untiered_element[0] . ' ' . $css_string;

    $new_array_layer = array($untiered_array[$index]);
    $closing_tag = '/' . $untiered_element[0];

    // Depth of the current tag relative to this element's direct children:
    // 0 means the next opening tag is a direct child.
    $tier_check = 0;

    // Walk forward until this element's own closing tag is reached at depth 0.
    // The isset() bound stops the scan at the end of the list, so a missing or
    // mismatched closing tag no longer makes the loop run forever.
    for($i = $index + 1; isset($untiered_array[$i]) && ($untiered_array[$i] !== $closing_tag || $tier_check != 0); $i++){
        $next_element_name = explode(' ', $untiered_array[$i]);

        // Void element: only recorded when it is a direct child; it never
        // changes the depth.
        if(in_array($next_element_name[0], $one_way_elements, true)){
            if($tier_check == 0){
                $css_string = attrToCSS($untiered_array[$i]);
                $new_array_layer[] = array($next_element_name[0] . ' ' . $css_string);
            }
            continue;
        }

        // Closing tag of some descendant: step one level back up.
        if(strpos($untiered_array[$i], '/') === 0){
            $tier_check--;
            continue;
        }

        // Opening tag: one level deeper. Only direct children (depth 1) are
        // built here; deeper descendants are handled by the recursive call.
        $tier_check++;
        if($tier_check == 1){
            $new_array_layer[] = htmlToArray($untiered_array, $i);
        }
    }

    return $new_array_layer;
}

/**
 * Turns the id and class attributes of a tag into CSS-style selectors.
 *
 * The id (if any) comes first as "#id", followed by one ".name" per class,
 * all separated by single spaces so the result can be split with explode().
 * Example: 'div class="a b" id="x"' gives '#x .a .b'.
 *
 * Fixes over the previous version: classes are no longer discarded when an id
 * is present; attributes whose names merely end in "id" or "class" (such as
 * data-id) are ignored; empty and single-quoted values are handled; a tag with
 * neither attribute yields '' without reading undefined offsets.
 *
 * @param string $attr_string Text between "<" and ">" of a tag.
 * @return string Space-separated selectors, or '' when the tag has neither
 *                an id nor a class.
 */
function attrToCSS($attr_string){

    $selectors = array();

    // The attribute name must start the string or follow whitespace, so that
    // "data-id" and similar names do not count as "id". \1 matches the same
    // quote character that opened the value.
    if(preg_match('/(?:^|\s)id\s*=\s*(["\'])(.*?)\1/i', $attr_string, $id_match)){
        $id_value_string = trim($id_match[2]);
        if($id_value_string !== ''){
            $selectors[] = '#' . $id_value_string;
        }
    }

    if(preg_match('/(?:^|\s)class\s*=\s*(["\'])(.*?)\1/i', $attr_string, $class_match)){
        // Split on any run of whitespace and drop empty pieces, so repeated
        // spaces do not produce bare "." entries.
        $class_value_array = preg_split('/\s+/', $class_match[2], -1, PREG_SPLIT_NO_EMPTY);

        foreach($class_value_array as $class){
            $selectors[] = '.' . $class;
        }
    }

    return implode(' ', $selectors);
}


?>

【讨论】:

    猜你喜欢
    • 2014-06-28
    • 2020-02-28
    • 1970-01-01
    • 2013-01-15
    • 2023-03-26
    • 2019-06-29
    相关资源
    最近更新 更多