研究一下扫描网站,获取指定有用信息
A.需求:
获取<IMG><A>标签中的内容
B.具体处理过程
1.获取目标网站的内容
2. 利用php7 自带的DOMDocument
C. 实现代码
WebGrab.php
<?php namespace Application\test; class WebGrab { protected $content=false; // 获得内容, 利用DomDocument类 public function getContent($url){ if (!$this->content) { $url = \'http://\' . $url; $this->content = new \DOMDocument(\'1.0\', \'utf-8\'); $this->content->preserveWhiteSpace = false; // @符号用于过滤掉配置错误的网页所生成的警告 @$this->content->loadHTMLFile($url); } return $this->content; } // 获取指定的标签 以及标签的内容 public function getTags($url,$tag){ $count=0; $result=array(); $elements=$this->getContent($url)->getElementsByTagName($tag); foreach ($elements as $node){ $result[$count][\'value\']=trim(preg_replace(\'/\s+/\',\' \',$node->nodeValue)); if ($node->hasAttributes()){ foreach ($node->attributes as $name=>$attr){ $result[$count][\'attributes\'][$name]=$attr->value; } } $count++; } return $result; } // 获取指定的属性 public function getAttribute($url,$attr,$domain=null){ $result=array(); $element=$this->getContent($url)->getElementsByTagName(\'*\'); foreach ($element as $node){ if ($node->hasAttribute($attr)){ $value=$node->getAttribute($attr); if ($domain){ if (stripos($value,$domain)!==false){ $result[]=trim($value); } }else{ $result[]=trim($value); } } } return $result; } }
执行代码:
web_grab.php
<?php hen you read this code, good luck for you. */ define(\'DEFAULT_URL\',\'\'); define(\'DEFAULT_TAG\',\'img\'); // 自己封装的自动加载类, require __DIR__.\'/../Autoload/Loader.php\'; Application\Autoload\Loader::init(__DIR__.\'/../..\'); $test=new Application\test\WebGrab(); $url=strip_tags($_GET[\'url\']??DEFAULT_URL); $tag=strip_tags($_GET[\'tag\']??DEFAULT_TAG); echo json_encode( [ \'message\'=>\'获取成功\', \'attribute\'=>$test->getAttribute($url,\'src\'), \'tag\'=>$test->getTags($url,$tag) ]);
执行结果