做采集的时候,可以使用file_get_contents()去获取网页源代码,但是使用file_get_contents采集,速度慢,而且超时时间,不好控制。如果采集的页面不存在,需要等待的时间很长。一般来说,curl的速度最快,其次是socket,最后是file_get_contents。
现在跟大家分享一个很强大的采集类,会根据你的服务器当前的配置,自动选择最快的方式。已经封装了curl和socket,file_get_contents
用法很简单:
1、采用get方法请求
Http::doGet(网址);//超时时间可忽略,默认是5秒
Http::doGet(网址,超时时间);
如 echo Http::doGet('http://www.baidu.com');
2、采用post方法请求
Http::doPost(网址,数据,超时时间);
如
$url='http://www.canphp.com/test.php';
$data['name']='单骑';
$data['email']='admin@canphp.com';
Http::doPost($url,$data,10);
test.php页面接收数据
$_POST['name'];
$_POST['email'];
这个http类不仅可以用来采集,还有一个很强大的作用,模拟php异步多进程。
比如有index.php和a.php, b.php, c.php
在index.php中
Http::doGet('http://www.canphp.com/a.php',1);
Http::doGet('http://www.canphp.com/b.php',1);
Http::doGet('http://www.canphp.com/c.php',1);
a.php, b.php, c.php程序分别在头部加上ignore_user_abort(true);
那么就可以实现多进程了。
原理:
通过curl或socket发送请求给a.php, b.php, c.php,由于超时时间比较短,只是触发了a.php, b.php, c.php三个页面,不需要等待数据返回,连接已中断,但是a.php, b.php, c.php程序中加上了ignore_user_abort(true);忽略客户端连接,还会继续执行。
<?php
/**
 * HTTP helper for data collection: GET / POST requests and file download.
 *
 * A request is sent through one of three transports, chosen by getSupport():
 *   1 = cURL, 2 = raw socket (fsockopen), 3 = file_get_contents with a stream context.
 * Every request method returns the response body as a string, or false on failure.
 */
class Http
{
    /**
     * Forced transport (1, 2 or 3). Any other value means "auto-detect".
     *
     * @var int
     */
    static public $way = 0;

    /**
     * Force the transport used by doGet() / doPost().
     *
     * @param int $way 1 = cURL, 2 = socket, 3 = file_get_contents; other values re-enable auto-detection
     * @return void
     */
    static public function setWay($way)
    {
        self::$way = intval($way);
    }

    /**
     * Resolve which transport to use.
     *
     * @return int the forced transport if one of 1/2/3 was set, otherwise the first
     *             available of cURL (1), fsockopen (2), file_get_contents (3); 0 if none
     */
    static public function getSupport()
    {
        // $way is public, so it may have been assigned a numeric string directly.
        $forced = intval(self::$way);
        if ($forced >= 1 && $forced <= 3) {
            return $forced;
        }

        if (function_exists('curl_init')) {
            return 1;
        }
        if (function_exists('fsockopen')) {
            return 2;
        }
        if (function_exists('file_get_contents')) {
            return 3;
        }
        return 0;
    }

    /**
     * Fetch a URL with GET.
     *
     * @param string $url     target URL; "http://" is prepended when no http(s) scheme is present
     * @param int    $timeout timeout in seconds (a falsy value makes the call return false)
     * @param string $header  extra request headers, CRLF separated; defaults to a browser-like set
     * @return string|false response body, or false on failure
     */
    static public function doGet($url, $timeout = 5, $header = "")
    {
        if (empty($url) || empty($timeout)) {
            return false;
        }
        $url = self::ensureScheme($url);

        switch (self::getSupport()) {
            case 1:
                return self::curlGet($url, $timeout, $header);
            case 2:
                return self::socketGet($url, $timeout, $header);
            case 3:
                return self::phpGet($url, $timeout, $header);
            default:
                return false;
        }
    }

    /**
     * Send form data with POST.
     *
     * @param string $url       target URL; "http://" is prepended when no http(s) scheme is present
     * @param array  $post_data fields to send (an empty array makes the call return false)
     * @param int    $timeout   timeout in seconds (a falsy value makes the call return false)
     * @param string $header    extra request headers, CRLF separated
     * @return string|false response body, or false on failure
     */
    static public function doPost($url, $post_data = array(), $timeout = 5, $header = "")
    {
        if (empty($url) || empty($post_data) || empty($timeout)) {
            return false;
        }
        $url = self::ensureScheme($url);

        switch (self::getSupport()) {
            case 1:
                return self::curlPost($url, $post_data, $timeout, $header);
            case 2:
                return self::socketPost($url, $post_data, $timeout, $header);
            case 3:
                return self::phpPost($url, $post_data, $timeout, $header);
            default:
                return false;
        }
    }

    /**
     * GET through cURL.
     *
     * @param string $url
     * @param int    $timeout seconds, used for both connect and total transfer time
     * @param string $header  CRLF separated headers; the default header set is used when empty
     * @return string|false
     */
    static public function curlGet($url, $timeout = 5, $header = "")
    {
        $header = empty($header) ? self::defaultHeader() : $header;
        return self::curlRequest($url, $timeout, $header, null);
    }

    /**
     * POST through cURL.
     *
     * Unlike the other transports this one sends no default headers when $header is
     * empty (kept as in the original implementation).
     *
     * @param string $url
     * @param array  $post_data
     * @param int    $timeout seconds, used for both connect and total transfer time
     * @param string $header  CRLF separated headers
     * @return string|false
     */
    static public function curlPost($url, $post_data = array(), $timeout = 5, $header = "")
    {
        $header = empty($header) ? '' : $header;
        return self::curlRequest($url, $timeout, $header, http_build_query($post_data));
    }

    /**
     * GET through a raw socket (HTTP/1.0, "Connection: Close").
     *
     * @param string $url
     * @param int    $timeout seconds, applied to connect and to each read
     * @param string $header  CRLF separated headers; the default header set is used when empty
     * @return string|false body of a 2xx response, false otherwise
     */
    static public function socketGet($url, $timeout = 5, $header = "")
    {
        $header = empty($header) ? self::defaultHeader() : $header;
        return self::socketRequest('GET', $url, $timeout, $header, null);
    }

    /**
     * POST through a raw socket (HTTP/1.0, "Connection: Close").
     *
     * @param string $url
     * @param array  $post_data
     * @param int    $timeout seconds, applied to connect and to each read
     * @param string $header  CRLF separated headers; the default header set is used when empty
     * @return string|false body of a 2xx response, false otherwise
     */
    static public function socketPost($url, $post_data = array(), $timeout = 5, $header = "")
    {
        $header = empty($header) ? self::defaultHeader() : $header;
        return self::socketRequest('POST', $url, $timeout, $header, http_build_query($post_data));
    }

    /**
     * GET through file_get_contents() with an HTTP stream context.
     *
     * @param string $url
     * @param int    $timeout seconds
     * @param string $header  CRLF separated headers; the default header set is used when empty
     * @return string|false
     */
    static public function phpGet($url, $timeout = 5, $header = "")
    {
        $header = empty($header) ? self::defaultHeader() : $header;
        $opts = array(
            'http' => array(
                'protocol_version' => '1.0',
                'method' => "GET",
                'timeout' => $timeout,
                'header' => self::normalizeHeader($header),
            ),
        );
        $context = stream_context_create($opts);
        return @file_get_contents($url, false, $context);
    }

    /**
     * POST through file_get_contents() with an HTTP stream context.
     *
     * @param string $url
     * @param array  $post_data
     * @param int    $timeout seconds
     * @param string $header  CRLF separated headers; the default header set is used when empty
     * @return string|false
     */
    static public function phpPost($url, $post_data = array(), $timeout = 5, $header = "")
    {
        $header = empty($header) ? self::defaultHeader() : $header;
        $post_string = http_build_query($post_data);

        // Normalise first so a caller-supplied header without a trailing CRLF
        // is not glued to the headers appended below.
        $header = self::normalizeHeader($header);
        if (!preg_match('/^Content-type:/mi', $header)) {
            $header .= "Content-type: application/x-www-form-urlencoded\r\n";
        }
        $header .= "Content-length: " . strlen($post_string) . "\r\n";

        $opts = array(
            'http' => array(
                'protocol_version' => '1.0',
                'method' => "POST",
                'timeout' => $timeout,
                'header' => $header,
                'content' => $post_string,
            ),
        );
        $context = stream_context_create($opts);
        return @file_get_contents($url, false, $context);
    }

    /**
     * Browser-like request headers sent when the caller supplies none.
     *
     * @return string CRLF terminated header lines
     */
    static private function defaultHeader()
    {
        $header = "User-Agent:Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.12) Gecko/20101026 Firefox/3.6.12\r\n";
        $header .= "Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n";
        $header .= "Accept-language: zh-cn,zh;q=0.5\r\n";
        $header .= "Accept-Charset: GB2312,utf-8;q=0.7,*;q=0.7\r\n";
        return $header;
    }

    /**
     * Prepend "http://" unless the URL already starts with http:// or https://.
     *
     * @param string $url
     * @return string
     */
    static private function ensureScheme($url)
    {
        // Require the "://" so hosts such as "httpbin.org" are not mistaken for a scheme.
        return preg_match('#^https?://#i', $url) ? $url : "http://" . $url;
    }

    /**
     * Make a header block end with exactly one CRLF (an empty block stays empty).
     *
     * @param string $header
     * @return string
     */
    static private function normalizeHeader($header)
    {
        $trimmed = rtrim((string) $header, "\r\n");
        return $trimmed === '' ? '' : $trimmed . "\r\n";
    }

    /**
     * Split a CRLF separated header block into one array element per header line.
     *
     * @param string $header
     * @return array
     */
    static private function headerLines($header)
    {
        $lines = array();
        foreach (preg_split('/\r\n|\r|\n/', (string) $header) as $line) {
            if (trim($line) !== '') {
                $lines[] = $line;
            }
        }
        return $lines;
    }

    /**
     * Shared cURL request used by curlGet() and curlPost().
     *
     * @param string      $url
     * @param int         $timeout
     * @param string      $header      CRLF separated header block (may be empty)
     * @param string|null $post_string url-encoded body; null sends a GET
     * @return string|false
     */
    static private function curlRequest($url, $timeout, $header, $post_string)
    {
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }
        if ($post_string !== null) {
            curl_setopt($ch, CURLOPT_POST, true);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $post_string);
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
        curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);

        // CURLOPT_HTTPHEADER expects one header per array element, not one
        // multi-line string.
        $lines = self::headerLines($header);
        if (!empty($lines)) {
            curl_setopt($ch, CURLOPT_HTTPHEADER, $lines);
        }

        $result = curl_exec($ch);
        curl_close($ch);
        return $result;
    }

    /**
     * Shared socket request used by socketGet() and socketPost().
     *
     * @param string      $method      "GET" or "POST"
     * @param string      $url         absolute http(s) URL
     * @param int         $timeout     seconds, applied to connect and to each read
     * @param string      $header      CRLF separated header block
     * @param string|null $post_string url-encoded body for POST, null for GET
     * @return string|false body of a 2xx response, false otherwise
     */
    static private function socketRequest($method, $url, $timeout, $header, $post_string)
    {
        $parts = parse_url($url);
        if ($parts === false || empty($parts['host'])) {
            return false;
        }

        $secure = isset($parts['scheme']) && strtolower($parts['scheme']) === 'https';
        $port = isset($parts['port']) ? $parts['port'] : ($secure ? 443 : 80);
        $path = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : "/";
        $query = (isset($parts['query']) && $parts['query'] !== '') ? "?" . $parts['query'] : "";

        // https needs the ssl:// transport; connect by host name so the TLS
        // layer sees the real host rather than a pre-resolved IP.
        $target = ($secure ? 'ssl://' : '') . $parts['host'];

        // fsockopen() returns false on failure (it never returns a negative value).
        $fsock = @fsockopen($target, $port, $errno, $errstr, $timeout);
        if (!$fsock) {
            return false;
        }
        // The fsockopen() timeout only covers connecting; bound the reads too.
        stream_set_timeout($fsock, max(1, (int) ceil($timeout)));

        $header = self::normalizeHeader($header);

        $in = $method . " " . $path . $query . " HTTP/1.0\r\n";
        if (!preg_match('/^Host:/mi', $header)) {
            $in .= "Host: " . $parts['host'] . (isset($parts['port']) ? ':' . $parts['port'] : '') . "\r\n";
        }
        $in .= $header;
        if ($post_string !== null) {
            $in .= "Content-type: application/x-www-form-urlencoded\r\n";
            $in .= "Content-Length: " . strlen($post_string) . "\r\n";
        }
        $in .= "Connection: Close\r\n\r\n";
        if ($post_string !== null) {
            // Exactly Content-Length bytes: no trailing CRLFs after the body.
            $in .= $post_string;
        }

        if (!@fwrite($fsock, $in, strlen($in))) {
            @fclose($fsock);
            return false;
        }
        return self::GetHttpContent($fsock);
    }

    /**
     * Read a whole HTTP response from an open stream, close the stream, and
     * return the body when the status code is 2xx.
     *
     * @param resource|null $fsock open stream positioned at the start of the response
     * @return string|false response body for a 2xx status; false for any other
     *                      status, a malformed response, or an invalid stream
     */
    static private function GetHttpContent($fsock = null)
    {
        if (!is_resource($fsock)) {
            return false;
        }

        $out = '';
        while (!feof($fsock)) {
            $chunk = @fgets($fsock, 2048);
            if ($chunk === false) {
                // read error or read timeout
                break;
            }
            $out .= $chunk;
        }
        fclose($fsock);

        $pos = strpos($out, "\r\n\r\n");
        if ($pos === false) {
            return false;
        }
        $head = substr($out, 0, $pos);
        $eol = strpos($head, "\r\n");
        // A response may consist of the status line only.
        $status = ($eol === false) ? $head : substr($head, 0, $eol);
        $body = (string) substr($out, $pos + 4);

        if (preg_match('/^HTTP\/\d\.\d\s+(\d+)/', $status, $matches)) {
            $code = intval($matches[1]);
            // Accept the whole 2xx range, not just 200.
            return ($code >= 200 && $code < 300) ? $body : false;
        }
        return false;
    }

    /**
     * Send a file to the browser as a download.
     *
     * Terminates the script with die() when the file does not exist.
     *
     * @param string $filename path of the file to send
     * @param string $showname file name shown to the user; defaults to basename($filename)
     * @param int    $expire   browser cache lifetime in seconds
     * @return bool true after the file has been output
     */
    static public function download($filename, $showname = '', $expire = 1800)
    {
        if (file_exists($filename) && is_file($filename)) {
            $length = filesize($filename);
        } else {
            die('下载文件不存在!');
        }

        if ($showname === '' || $showname === null) {
            $showname = basename($filename);
        }
        // Strip line breaks so the name cannot inject additional response headers.
        $showname = str_replace(array("\r", "\n"), '', $showname);

        $type = mime_content_type($filename);
        if (empty($type)) {
            // the built-in mime_content_type() returns false when detection fails
            $type = 'application/octet-stream';
        }

        header("Pragma: public");
        header("Cache-control: max-age=" . $expire);
        // HTTP dates need a space before the "GMT" suffix.
        header("Expires: " . gmdate("D, d M Y H:i:s", time() + $expire) . " GMT");
        header("Last-Modified: " . gmdate("D, d M Y H:i:s", time()) . " GMT");
        header("Content-Disposition: attachment; filename=" . $showname);
        header("Content-Length: " . $length);
        header("Content-type: " . $type);
        header('Content-Encoding: none');
        header("Content-Transfer-Encoding: binary");
        readfile($filename);
        return true;
    }
}

if (!function_exists('mime_content_type')) {
    /**
     * Fallback for mime_content_type(): guess the MIME type from the file extension.
     *
     * @param string $filename file name or path (the file is not opened)
     * @return string MIME type; "application/octet-stream" for unknown extensions
     */
    function mime_content_type($filename)
    {
        static $contentType = array(
            'ai' => 'application/postscript',
            'aif' => 'audio/x-aiff',
            'aifc' => 'audio/x-aiff',
            'aiff' => 'audio/x-aiff',
            'asc' => 'application/pgp', // changed by skwashd - was text/plain
            'asf' => 'video/x-ms-asf',
            'asx' => 'video/x-ms-asf',
            'au' => 'audio/basic',
            'avi' => 'video/x-msvideo',
            'bcpio' => 'application/x-bcpio',
            'bin' => 'application/octet-stream',
            'bmp' => 'image/bmp',
            'c' => 'text/plain', // or 'text/x-csrc' (added by skwashd)
            'cc' => 'text/plain', // or 'text/x-c++src' (added by skwashd)
            'cs' => 'text/plain', // added by skwashd - for C# src
            'cpp' => 'text/x-c++src', // added by skwashd
            'cxx' => 'text/x-c++src', // added by skwashd
            'cdf' => 'application/x-netcdf',
            'class' => 'application/octet-stream', // secure but application/java-class is correct
            'com' => 'application/octet-stream', // added by skwashd
            'cpio' => 'application/x-cpio',
            'cpt' => 'application/mac-compactpro',
            'csh' => 'application/x-csh',
            'css' => 'text/css',
            'csv' => 'text/comma-separated-values', // added by skwashd
            'dcr' => 'application/x-director',
            'diff' => 'text/diff',
            'dir' => 'application/x-director',
            'dll' => 'application/octet-stream',
            'dms' => 'application/octet-stream',
            'doc' => 'application/msword',
            'dot' => 'application/msword', // added by skwashd
            'dvi' => 'application/x-dvi',
            'dxr' => 'application/x-director',
            'eps' => 'application/postscript',
            'etx' => 'text/x-setext',
            'exe' => 'application/octet-stream',
            'ez' => 'application/andrew-inset',
            'gif' => 'image/gif',
            'gtar' => 'application/x-gtar',
            'gz' => 'application/x-gzip',
            'h' => 'text/plain', // or 'text/x-chdr' (added by skwashd)
            'h++' => 'text/plain', // or 'text/x-c++hdr' (added by skwashd)
            'hh' => 'text/plain', // or 'text/x-c++hdr' (added by skwashd)
            'hpp' => 'text/plain', // or 'text/x-c++hdr' (added by skwashd)
            'hxx' => 'text/plain', // or 'text/x-c++hdr' (added by skwashd)
            'hdf' => 'application/x-hdf',
            'hqx' => 'application/mac-binhex40',
            'htm' => 'text/html',
            'html' => 'text/html',
            'ice' => 'x-conference/x-cooltalk',
            'ics' => 'text/calendar',
            'ief' => 'image/ief',
            'ifb' => 'text/calendar',
            'iges' => 'model/iges',
            'igs' => 'model/iges',
            'jar' => 'application/x-jar', // added by skwashd - alternative mime type
            'java' => 'text/x-java-source', // added by skwashd
            'jpe' => 'image/jpeg',
            'jpeg' => 'image/jpeg',
            'jpg' => 'image/jpeg',
            'js' => 'application/x-javascript',
            'kar' => 'audio/midi',
            'latex' => 'application/x-latex',
            'lha' => 'application/octet-stream',
            'log' => 'text/plain',
            'lzh' => 'application/octet-stream',
            'm3u' => 'audio/x-mpegurl',
            'man' => 'application/x-troff-man',
            'me' => 'application/x-troff-me',
            'mesh' => 'model/mesh',
            'mid' => 'audio/midi',
            'midi' => 'audio/midi',
            'mif' => 'application/vnd.mif',
            'mov' => 'video/quicktime',
            'movie' => 'video/x-sgi-movie',
            'mp2' => 'audio/mpeg',
            'mp3' => 'audio/mpeg',
            'mpe' => 'video/mpeg',
            'mpeg' => 'video/mpeg',
            'mpg' => 'video/mpeg',
            'mpga' => 'audio/mpeg',
            'ms' => 'application/x-troff-ms',
            'msh' => 'model/mesh',
            'mxu' => 'video/vnd.mpegurl',
            'nc' => 'application/x-netcdf',
            'oda' => 'application/oda',
            'patch' => 'text/diff',
            'pbm' => 'image/x-portable-bitmap',
            'pdb' => 'chemical/x-pdb',
            'pdf' => 'application/pdf',
            'pgm' => 'image/x-portable-graymap',
            'pgn' => 'application/x-chess-pgn',
            'pgp' => 'application/pgp', // added by skwashd
            'php' => 'application/x-httpd-php',
            'php3' => 'application/x-httpd-php3',
            'pl' => 'application/x-perl',
            'pm' => 'application/x-perl',
            'png' => 'image/png',
            'pnm' => 'image/x-portable-anymap',
            'po' => 'text/plain',
            'ppm' => 'image/x-portable-pixmap',
            'ppt' => 'application/vnd.ms-powerpoint',
            'ps' => 'application/postscript',
            'qt' => 'video/quicktime',
            'ra' => 'audio/x-realaudio',
            'rar' => 'application/octet-stream',
            'ram' => 'audio/x-pn-realaudio',
            'ras' => 'image/x-cmu-raster',
            'rgb' => 'image/x-rgb',
            'rm' => 'audio/x-pn-realaudio',
            'roff' => 'application/x-troff',
            'rpm' => 'audio/x-pn-realaudio-plugin',
            'rtf' => 'text/rtf',
            'rtx' => 'text/richtext',
            'sgm' => 'text/sgml',
            'sgml' => 'text/sgml',
            'sh' => 'application/x-sh',
            'shar' => 'application/x-shar',
            'shtml' => 'text/html',
            'silo' => 'model/mesh',
            'sit' => 'application/x-stuffit',
            'skd' => 'application/x-koan',
            'skm' => 'application/x-koan',
            'skp' => 'application/x-koan',
            'skt' => 'application/x-koan',
            'smi' => 'application/smil',
            'smil' => 'application/smil',
            'snd' => 'audio/basic',
            'so' => 'application/octet-stream',
            'spl' => 'application/x-futuresplash',
            'src' => 'application/x-wais-source',
            'stc' => 'application/vnd.sun.xml.calc.template',
            'std' => 'application/vnd.sun.xml.draw.template',
            'sti' => 'application/vnd.sun.xml.impress.template',
            'stw' => 'application/vnd.sun.xml.writer.template',
            'sv4cpio' => 'application/x-sv4cpio',
            'sv4crc' => 'application/x-sv4crc',
            'swf' => 'application/x-shockwave-flash',
            'sxc' => 'application/vnd.sun.xml.calc',
            'sxd' => 'application/vnd.sun.xml.draw',
            'sxg' => 'application/vnd.sun.xml.writer.global',
            'sxi' => 'application/vnd.sun.xml.impress',
            'sxm' => 'application/vnd.sun.xml.math',
            'sxw' => 'application/vnd.sun.xml.writer',
            't' => 'application/x-troff',
            'tar' => 'application/x-tar',
            'tcl' => 'application/x-tcl',
            'tex' => 'application/x-tex',
            'texi' => 'application/x-texinfo',
            'texinfo' => 'application/x-texinfo',
            'tgz' => 'application/x-gtar',
            'tif' => 'image/tiff',
            'tiff' => 'image/tiff',
            'tr' => 'application/x-troff',
            'tsv' => 'text/tab-separated-values',
            'txt' => 'text/plain',
            'ustar' => 'application/x-ustar',
            'vbs' => 'text/plain', // added by skwashd - for obvious reasons
            'vcd' => 'application/x-cdlink',
            'vcf' => 'text/x-vcard',
            'vcs' => 'text/calendar',
            'vfb' => 'text/calendar',
            'vrml' => 'model/vrml',
            'vsd' => 'application/vnd.visio',
            'wav' => 'audio/x-wav',
            'wax' => 'audio/x-ms-wax',
            'wbmp' => 'image/vnd.wap.wbmp',
            'wbxml' => 'application/vnd.wap.wbxml',
            'wm' => 'video/x-ms-wm',
            'wma' => 'audio/x-ms-wma',
            'wmd' => 'application/x-ms-wmd',
            'wml' => 'text/vnd.wap.wml',
            'wmlc' => 'application/vnd.wap.wmlc',
            'wmls' => 'text/vnd.wap.wmlscript',
            'wmlsc' => 'application/vnd.wap.wmlscriptc',
            'wmv' => 'video/x-ms-wmv',
            'wmx' => 'video/x-ms-wmx',
            'wmz' => 'application/x-ms-wmz',
            'wrl' => 'model/vrml',
            'wvx' => 'video/x-ms-wvx',
            'xbm' => 'image/x-xbitmap',
            'xht' => 'application/xhtml+xml',
            'xhtml' => 'application/xhtml+xml',
            'xls' => 'application/vnd.ms-excel',
            'xlt' => 'application/vnd.ms-excel',
            'xml' => 'application/xml',
            'xpm' => 'image/x-xpixmap',
            'xsl' => 'text/xml',
            'xwd' => 'image/x-xwindowdump',
            'xyz' => 'chemical/x-xyz',
            'z' => 'application/x-compress',
            'zip' => 'application/zip',
        );

        $type = strtolower(pathinfo($filename, PATHINFO_EXTENSION));
        if (isset($contentType[$type])) {
            return $contentType[$type];
        }
        return 'application/octet-stream';
    }
}

if (!function_exists('image_type_to_extension')) {
    /**
     * Fallback for image_type_to_extension(): map an IMAGETYPE_* constant to a
     * dotted file extension.
     *
     * @param int $imagetype one of the IMAGETYPE_* constants
     * @return string|false extension including the leading dot, or false for an
     *                      empty or unknown type
     */
    function image_type_to_extension($imagetype)
    {
        if (empty($imagetype)) {
            return false;
        }
        switch ($imagetype) {
            case IMAGETYPE_GIF:
                return '.gif';
            case IMAGETYPE_JPEG:
                return '.jpg';
            case IMAGETYPE_PNG:
                return '.png';
            case IMAGETYPE_SWF:
                return '.swf';
            case IMAGETYPE_PSD:
                return '.psd';
            case IMAGETYPE_BMP:
                return '.bmp';
            case IMAGETYPE_TIFF_II:
            case IMAGETYPE_TIFF_MM:
                return '.tiff';
            case IMAGETYPE_JPC:
                return '.jpc';
            case IMAGETYPE_JP2:
                return '.jp2';
            case IMAGETYPE_JPX:
                return '.jpf';
            case IMAGETYPE_JB2:
                return '.jb2';
            case IMAGETYPE_SWC:
                return '.swc';
            case IMAGETYPE_IFF:
                // IFF images use ".iff"; ".aiff" is an audio format extension.
                return '.iff';
            case IMAGETYPE_WBMP:
                return '.wbmp';
            case IMAGETYPE_XBM:
                return '.xbm';
            default:
                return false;
        }
    }
}
?>
方法:download($filename, $showname='', $expire=1800)
说明:用于下载文件
参数:
•$filename,包含路径的文件名
•$showname,下载显示的文件名,需要自行转成gbk编码,如果带空格,需要自行替换成其他字符
•$expire,下载内容浏览器缓存时间
使用方法:
$showname='最新资料.zip';
$showname=auto_charset($showname,'utf-8','gbk');//utf-8编码转成gbk编码
Http::download('upload/123.zip',$showname);
方法:doGet($url,$timeout=5,$header='')
说明:采用get方法请求页面,会自动使用最快的访问方式,获取数据
参数:
•$url,网址
•$timeout,超时时间
•$header,http请求头,用于发送cookie等信息
使用方法:
echo Http::doGet('http://www.baidu.com');
方法:doPost($url,$data,$timeout=5,$header='')
说明:采用post方法请求页面,会自动使用最快的访问方式,获取数据
参数:
•$url,网址
•$data,待发送的数据,类型数组。
•$timeout,超时时间
•$header,http请求头,用于发送cookie等信息
使用方法:
$url='http://www.canphp.com/test.php';
$data['name']='单骑';
$data['email']='admin@canphp.com';
Http::doPost($url,$data,10);

//test.php页面接收数据
$_POST['name'];
$_POST['email'];
方法:setWay($way)
说明:手动设置doGet()和doPost()访问方式
参数:
•$way:参数可以是1、2或3
•参数1时:采用curl
•参数2时:采用socket
•参数3时:采用file_get_contents()函数模拟
•若不设置访问方式,会自动获取当前环境的支持方式,选择最佳的方式去访问,优先度curl > socket > file_get_contents
使用方法:
Http::setWay(3);
echo Http::doGet('http://www.baidu.com');//将采用file_get_contents()方式获取内容