a.要采集的源链接:
http://www.prospecbio.com/Recombinant_Proteins/
b.具体要求:
接下来就是采集代码的编写。
对于:b-(1)中,代码如下:
<?php
/**
 * Step b-(1): collect first-level category links.
 *
 * Loads the page at $url with phpQuery, walks every element with the CSS
 * class "Body", takes the href of the first <a> found in the first cell of
 * the first row of the first table inside it, prefixes it with the site
 * root and appends the resulting absolute URL to Url.txt (one per line,
 * CRLF-terminated).
 */
header('Content-Type:text/html;charset=UTF-8');
// The script cannot do anything without phpQuery, so fail immediately if it is missing.
require_once './phpQuery/phpQuery.php';
set_time_limit(10000);

$url = "http://www.prospecbio.com/Hormones/";
// Site root prepended to every collected href.
$siteRoot = 'http://www.prospecbio.com';

echo "当前的URL:";
echo $url."<br/>";

phpQuery::newDocumentFile($url);

foreach (pq(".Body") as $bodyNode) {
    $href = pq($bodyNode)->eq(0)
        ->find("table")->eq(0)
        ->find("tr")->eq(0)
        ->find("td")->eq(0)
        ->find("a")->eq(0)
        ->attr('href');

    // attr() may yield null when no link exists; cast so trim() always gets a string.
    $href = trim((string) $href);
    if ($href === '') {
        continue;
    }

    file_put_contents('Url.txt', $siteRoot.$href."\r\n", FILE_APPEND);
}
?>
对于:b-(2)中代码如下:
<?php
/**
 * Step b-(2): collect second-level links, one source page per request.
 *
 * Reads the page URL stored in table `url_a` for the row whose id equals the
 * `id` query parameter (default 1), scrapes that page with phpQuery and
 * appends each discovered absolute link to Url_a.txt. The inline script at
 * the bottom then reloads the page with id+1; the chain stops once id
 * exceeds 14.
 */
header('Content-Type:text/html;charset=UTF-8');
// The script cannot do anything without phpQuery, so fail immediately if it is missing.
require_once './phpQuery/phpQuery.php';
set_time_limit(10000);

$id = isset($_GET['id']) ? intval($_GET['id']) : 1;
if ($id > 14) {
    echo "finish!";
    exit;
}
echo "当前 id=".$id;
echo "<br/>";

// The legacy mysql_* API was removed in PHP 7, so mysqli is used instead.
// Reporting is switched off so that failures return false (as mysql_* did)
// and the `or die` handlers below still fire.
mysqli_report(MYSQLI_REPORT_OFF);
$conn = mysqli_connect("localhost", "root", "root") or die("连接服务器失败 !!!");
mysqli_select_db($conn, "prospect") or die("选择数据库失败 !!!");

// $id has been through intval(), so concatenating it cannot inject SQL.
$sql = 'select url from url_a where id ='.$id;
echo "当前sql :".$sql;
echo '<br/>';

$query = mysqli_query($conn, $sql);
$res = $query ? mysqli_fetch_assoc($query) : null;
mysqli_close($conn);

// A failed query or a missing row leaves $url empty instead of raising notices.
$url = (is_array($res) && isset($res['url'])) ? trim((string) $res['url']) : '';
echo "当前的url:".$url;
echo '<br/>';

// Without a URL there is nothing to fetch; the redirect below still advances to the next id.
if ($url !== '') {
    $siteRoot = 'http://www.prospecbio.com';

    phpQuery::newDocumentFile($url);

    foreach (pq(".Body") as $k => $bodyNode) {
        // The first ".Body" element (index 0) is skipped, as in the original logic.
        if ($k <= 0) {
            continue;
        }

        $href = pq($bodyNode)->eq(0)
            ->find('table')->eq(0)
            ->find('tr')->eq(1)
            ->find('td')->eq(0)
            ->find('a')
            ->attr('href');

        // attr() may yield null when no link exists; cast so trim() always gets a string.
        $href = trim((string) $href);
        if ($href === '') {
            continue;
        }

        // Strip leading dots so a relative href such as "../x/" becomes "/x/".
        $path = $siteRoot.ltrim($href, ".")."\r\n";
        var_dump($path);
        echo '<br/>';
        file_put_contents('Url_a.txt', $path, FILE_APPEND);
    }
}
?>
<script>
// Immediately reload the page for the next id so the whole list is processed unattended.
function JumpUrl(){
    location.href='?id=<?php echo ($id+1);?>';
}
setTimeout(JumpUrl,0);
</script>
对于:b-(3)中,代码如下:
<?php
/**
 * Step b-(3): scrape one product detail page per request into a CSV row.
 *
 * Reads the product URL stored in table `url_b` for the row whose id equals
 * the `id` query parameter (default 1), extracts the product fields with
 * phpQuery and appends them as one row to Neurotrophins.csv. The inline
 * script at the bottom then reloads the page with id+1; the chain stops once
 * id exceeds 63.
 *
 * CSV column order: product_url, product_name, price, cata_num, source,
 * appearance, formulation, stability, solubility, purity, amino_acid,
 * bio_activity.
 */
header('Content-Type:text/html;charset=gb2312');
// The script cannot do anything without phpQuery, so fail immediately if it is missing.
require_once './phpQuery/phpQuery.php';
set_time_limit(100000);

/**
 * Convert UTF-8 text to GBK.
 *
 * If iconv() cannot convert the input it returns false; in that case the
 * original text is returned unchanged so that no scraped data is lost.
 *
 * @param string $text UTF-8 input
 * @return string GBK text, or the unchanged input when conversion fails
 */
function scrapeToGbk($text)
{
    $converted = iconv('utf-8', 'gbk', $text);

    return $converted === false ? $text : $converted;
}

/**
 * Normalise the HTML of a detail cell: trim, strip tags, convert to GBK.
 *
 * @param string|null $html raw inner HTML as returned by phpQuery
 * @return string cleaned text
 */
function scrapeCleanDetail($html)
{
    return scrapeToGbk(strip_tags(trim((string) $html)));
}

$id = isset($_GET['id']) ? intval($_GET['id']) : 1;
if ($id > 63) {
    echo "finish!";
    exit;
}
echo "当前 id=".$id;
echo "<br/>";

// The legacy mysql_* API was removed in PHP 7, so mysqli is used instead.
// Reporting is switched off so that failures return false (as mysql_* did)
// and the `or die` handlers below still fire.
mysqli_report(MYSQLI_REPORT_OFF);
$conn = mysqli_connect("localhost", "root", "root") or die("连接服务器失败 !!!");
mysqli_select_db($conn, "prospect") or die("选择数据库失败 !!!");

// $id has been through intval(), so concatenating it cannot inject SQL.
$sql = 'select url from url_b where id ='.$id;
echo "当前sql :".$sql;
echo '<br/>';

$query = mysqli_query($conn, $sql);
$res = $query ? mysqli_fetch_assoc($query) : null;
mysqli_close($conn);

// A failed query or a missing row leaves $url empty instead of raising notices.
$url = (is_array($res) && isset($res['url'])) ? trim((string) $res['url']) : '';
echo "当前的url:".$url;
echo '<br/>';

// Without a URL there is nothing to fetch; the redirect below still advances to the next id.
if ($url !== '') {
    phpQuery::newDocumentFile($url);

    // Holds the product record; key order defines the CSV column order.
    $arr = [];
    $arr['product_url'] = $url;

    // Product name.
    // NOTE(review): name and catalogue number are written without GBK conversion,
    // exactly as the original did — confirm whether they should be converted too.
    $product_name = trim((string) pq('#PageHeader')->eq(0)->find('span')->eq(0)->html());
    echo "产品名称:";
    var_dump($product_name);
    $arr['product_name'] = $product_name;
    echo '<br/>';

    // Price: the label in the second cell of each of the first three rows, joined with "/".
    $priceParts = [];
    for ($row = 0; $row < 3; $row++) {
        $priceParts[] = trim((string) pq('.ProductsColumnPrice')
            ->find('table')->eq(0)
            ->find('tr')->eq($row)
            ->find('td')->eq(1)
            ->find('label')->eq(0)
            ->html());
    }
    // The original called iconv() on the price but threw the result away, so the
    // price was never converted. Convert exactly once, on the joined string.
    $price = scrapeToGbk(implode("/", $priceParts));
    echo "产品价格:";
    var_dump($price);
    $arr['price'] = $price;
    echo '<br/>';

    // Catalogue number: content of the first value cell.
    $cata_num = trim((string) pq('.ItemRowLastCellStyle')->eq(0)->html());
    echo "产品 Catalogue Number:";
    var_dump($cata_num);
    $arr['cata_num'] = $cata_num;
    echo '<br/>';

    // Detail fields default to empty so every CSV row has the same columns.
    $arr['source'] = '';
    $arr['appearance'] = '';
    $arr['formulation'] = '';
    $arr['stability'] = '';
    $arr['solubility'] = '';
    $arr['purity'] = '';
    $arr['amino_acid'] = '';
    $arr['bio_activity'] = '';

    // Page label => [record key, text echoed before the dumped value].
    $detailFields = [
        'Source'              => ['source', "产品 Source:"],
        'Physical Appearance' => ['appearance', "产品 Physical Appearance:"],
        'Formulation'         => ['formulation', "产品 Formulation:"],
        'Stability'           => ['stability', "产品 Stability:"],
        'Purity'              => ['purity', "产品 Purity:"],
        'Amino acid sequence' => ['amino_acid', "产品 Amino acid sequence:"],
        'Amino Acid Sequence' => ['amino_acid', "产品 Amino acid sequence:"],
        'Solubility'          => ['solubility', "产品 Solubility :"],
        'Biological Activity' => ['bio_activity', "产品 Biological Activity:"],
    ];

    foreach (pq('.ItemRowFirstCellStyle') as $k => $labelCell) {
        $label = trim((string) pq($labelCell)->eq(0)->find('span')->eq(0)->html());

        if (isset($detailFields[$label])) {
            $key = $detailFields[$label][0];
            $caption = $detailFields[$label][1];

            // The value is read from the ".ItemRowLastCellStyle" cell at the same index as the label cell.
            $value = scrapeCleanDetail(
                pq('.ItemRowLastCellStyle')->eq($k)->find('span')->eq(0)->html()
            );

            echo $caption;
            var_dump($value);
            $arr[$key] = $value;
            echo '<br/>';
        }

        echo '<br/>';
    }

    // Append the record to the CSV file.
    $handle = fopen('Neurotrophins.csv', 'a');
    if ($handle !== false) {
        fputcsv($handle, $arr);
        fclose($handle);
    }
}
?>
<script>
// Immediately reload the page for the next id so the whole list is processed unattended.
function JumpUrl(){
    location.href='?id=<?php echo ($id+1);?>';
}
setTimeout(JumpUrl,0);
</script>
说明:此次采集对 phpQuery 采集数据的方法做了局部优化,使我对此方法有了更深的认识。好方法是成功的一半。
同时也有部分不足:毕竟该采集方法是针对页面源码进行处理,采集代码由页面的排版决定,所以并不通用,
这在以后的学习过程中,还要继续优化和完善。学无止境,加油!