wuheng1991

a.要采集的源链接:

http://www.prospecbio.com/Recombinant_Proteins/

b.具体要求:

接下来就是采集代码的编写。

对于:b-(1)中,代码如下:

<?php
/**
 * Step b-(1): collect first-level links from one listing page.
 *
 * Loads the listing page with phpQuery, walks every element with class
 * "Body", takes the href of the first link in the first cell of the first
 * row of that element's first table, prefixes it with the site root and
 * appends it (one URL per line, CRLF-terminated) to Url.txt.
 */
header('Content-Type:text/html;charset=UTF-8');
include './phpQuery/phpQuery.php';
set_time_limit(10000);

$url = "http://www.prospecbio.com/Hormones/";
echo "当前的URL:";
echo $url."<br/>";
phpQuery::newDocumentFile($url);

// Site root that every scraped href is appended to; constant for the whole run.
$head = 'http://www.prospecbio.com';

foreach (pq(".Body") as $li) {
    // attr() may yield no value when the element has no link; cast so trim() always gets a string.
    $tr = pq($li)->eq(0)->find("table")->eq(0)->find("tr")->eq(0)->find("td")->eq(0)->find("a")->eq(0)->attr('href');
    $tr = trim((string) $tr);

    // Elements without a usable link contribute nothing to the output file.
    if ($tr === '') {
        continue;
    }

    $path = $head.$tr."\r\n";
    if (file_put_contents('Url.txt', $path, FILE_APPEND) === false) {
        // Surface write failures instead of silently losing the URL.
        echo "写入 Url.txt 失败:".$path."<br/>";
    }
}

?>

对于:b-(2)中代码如下:

<?php
/**
 * Step b-(2): collect second-level links, one source row per request.
 *
 * Each request handles the row of table `url_a` whose id is given by the
 * `id` query parameter (default 1): it loads that row's URL with phpQuery,
 * extracts one link per ".Body" element and appends the absolute URLs to
 * Url_a.txt. The trailing script then redirects the browser to id+1, so the
 * page keeps reloading itself until id exceeds 14.
 */
header('Content-Type:text/html;charset=UTF-8');
include './phpQuery/phpQuery.php';
set_time_limit(10000);

// intval() guarantees $id is an integer, which is what makes the SQL concatenation below safe.
$id = isset($_GET['id']) ? intval($_GET['id']) : 1;

// Ids above 14 end the run; no redirect script is emitted after exit.
if ($id > 14) {
    echo "finish!";
    exit;
}
echo "当前 id=".$id;
echo "<br/>";

// mysqli replaces the mysql_* API, which no longer exists in PHP 7+.
$conn = mysqli_connect("localhost", "root", "root") or die("连接服务器失败 !!!");
mysqli_select_db($conn, "prospect") or die("选择数据库失败 !!!");

$sql = 'select url from url_a where id ='.$id;
echo "当前sql :".$sql;
echo '<br/>';
$query = mysqli_query($conn, $sql);
// A failed query or a missing row leaves $res empty instead of indexing into false.
$res = $query ? mysqli_fetch_assoc($query) : null;
mysqli_close($conn);

$url = ($res && isset($res['url'])) ? trim($res['url']) : '';
echo "当前的url:".$url;
echo '<br/>';

if ($url === '') {
    // Nothing to fetch for this id; fall through to the redirect so the run continues.
    echo "未找到 id=".$id." 对应的 url,已跳过";
    echo '<br/>';
} else {
    phpQuery::newDocumentFile($url);

    // Site root that every scraped href is appended to.
    $head = 'http://www.prospecbio.com';

    foreach (pq(".Body") as $k => $li) {
        // Link in the first cell of the second row of the element's first table.
        $tr = pq($li)->eq(0)->find('table')->eq(0)->find('tr')->eq(1)->find('td')->eq(0)->find('a')->attr('href');
        $tr = trim((string) $tr);

        // The first ".Body" element (index 0) is never recorded, nor are elements without a link.
        if ($tr === '' || $k <= 0) {
            continue;
        }

        // Strip leading dots so a relative href such as "../x/" becomes "/x/" under the site root.
        $path = $head.ltrim($tr, ".")."\r\n";

        var_dump($path);
        echo '<br/>';

        if (file_put_contents('Url_a.txt', $path, FILE_APPEND) === false) {
            // Surface write failures instead of silently losing the URL.
            echo "写入 Url_a.txt 失败";
            echo '<br/>';
        }
    }
}

?>
<script>
// Reload this page for the next row id as soon as the current one is done.
function JumpUrl(){
    location.href='?id=<?php echo ($id+1);?>';
}
setTimeout(JumpUrl,0);
</script>

对于:b-(3)中,代码如下:

<?php
/**
 * Step b-(3): scrape one product page per request into Neurotrophins.csv.
 *
 * Each request handles the row of table `url_b` whose id is given by the
 * `id` query parameter (default 1; pass ?id=N to resume from another row).
 * The product page is loaded with phpQuery, the fields below are extracted
 * and appended as one CSV row, then the trailing script redirects the
 * browser to id+1. The run stops once id exceeds 63.
 *
 * CSV column order: product_url, product_name, price, cata_num, source,
 * appearance, formulation, stability, solubility, purity, amino_acid,
 * bio_activity.
 */
header('Content-Type:text/html;charset=gb2312');
include './phpQuery/phpQuery.php';
set_time_limit(100000);

/**
 * Returns the text of the value cell at the given position.
 *
 * Reads the first <span> of the $index-th ".ItemRowLastCellStyle" element,
 * trims it, removes markup and converts the result from UTF-8 to GBK.
 *
 * @param int $index Position of the value cell among all ".ItemRowLastCellStyle" elements.
 * @return string|false Converted text, or false when iconv() cannot convert it.
 */
function read_detail_value($index)
{
    $html = pq('.ItemRowLastCellStyle')->eq($index)->find('span')->eq(0)->html();

    return iconv('utf-8', 'gbk', strip_tags(trim((string) $html)));
}

/**
 * Loads a product page and extracts every CSV field from it.
 *
 * Echoes each extracted value as it goes, exactly as a progress log.
 *
 * @param string $url Absolute URL of the product page.
 * @return array Field name => value, in CSV column order.
 */
function scrape_product($url)
{
    phpQuery::newDocumentFile($url);

    // Pre-seed every column so the CSV row always has the same shape and order.
    $arr = array(
        'product_url'  => $url,
        'product_name' => '',
        'price'        => '',
        'cata_num'     => '',
        'source'       => '',
        'appearance'   => '',
        'formulation'  => '',
        'stability'    => '',
        'solubility'   => '',
        'purity'       => '',
        'amino_acid'   => '',
        'bio_activity' => '',
    );

    // Product name.
    // NOTE(review): unlike the detail fields, product_name and cata_num are written
    // without a GBK conversion, as in the original script — confirm that is intended.
    $product_name = trim((string) pq('#PageHeader')->eq(0)->find('span')->eq(0)->html());
    echo "产品名称:";
    var_dump($product_name);
    $arr['product_name'] = $product_name;
    echo '<br/>';

    // Price: the label in the second cell of each of the first three rows, joined with "/".
    $prices = array();
    for ($row = 0; $row < 3; $row++) {
        $label = pq('.ProductsColumnPrice')->find('table')->eq(0)->find('tr')->eq($row)->find('td')->eq(1)->find('label')->eq(0)->html();
        $prices[] = trim((string) $label);
    }
    // Convert once, on the joined string: converting the parts and then the whole
    // would run already-converted GBK bytes through a UTF-8 decoder.
    $price = iconv('utf-8', 'gbk', implode('/', $prices));
    echo "产品价格:";
    var_dump($price);
    $arr['price'] = $price;
    echo '<br/>';

    // Catalogue number: content of the first value cell on the page.
    $cata_num = trim((string) pq('.ItemRowLastCellStyle')->eq(0)->html());
    echo "产品 Catalogue Number:";
    var_dump($cata_num);
    $arr['cata_num'] = $cata_num;
    echo '<br/>';

    // Row label on the page => array(CSV column, progress message).
    $fields = array(
        'Source'              => array('source', "产品 Source:"),
        'Physical Appearance' => array('appearance', "产品 Physical Appearance:"),
        'Formulation'         => array('formulation', "产品 Formulation:"),
        'Stability'           => array('stability', "产品 Stability:"),
        'Purity'              => array('purity', "产品 Purity:"),
        'Amino acid sequence' => array('amino_acid', "产品 Amino acid sequence:"),
        'Amino Acid Sequence' => array('amino_acid', "产品 Amino acid sequence:"),
        'Solubility'          => array('solubility', "产品 Solubility :"),
        'Biological Activity' => array('bio_activity', "产品 Biological Activity:"),
    );

    // The k-th label cell is paired with the k-th value cell.
    foreach (pq('.ItemRowFirstCellStyle') as $k => $li) {
        $label = trim((string) pq($li)->eq(0)->find('span')->eq(0)->html());

        if (isset($fields[$label])) {
            $column = $fields[$label][0];
            $value = read_detail_value($k);
            echo $fields[$label][1];
            var_dump($value);
            $arr[$column] = $value;
            echo '<br/>';
        }

        echo '<br/>';
    }

    return $arr;
}

// intval() guarantees $id is an integer, which is what makes the SQL concatenation below safe.
$id = isset($_GET['id']) ? intval($_GET['id']) : 1;

// Ids above 63 end the run; no redirect script is emitted after exit.
if ($id > 63) {
    echo "finish!";
    exit;
}
echo "当前 id=".$id;
echo "<br/>";

// mysqli replaces the mysql_* API, which no longer exists in PHP 7+.
$conn = mysqli_connect("localhost", "root", "root") or die("连接服务器失败 !!!");
mysqli_select_db($conn, "prospect") or die("选择数据库失败 !!!");

$sql = 'select url from url_b where id ='.$id;
echo "当前sql :".$sql;
echo '<br/>';
$query = mysqli_query($conn, $sql);
// A failed query or a missing row leaves $res empty instead of indexing into false.
$res = $query ? mysqli_fetch_assoc($query) : null;
mysqli_close($conn);

$url = ($res && isset($res['url'])) ? trim($res['url']) : '';
echo "当前的url:".$url;
echo '<br/>';

if ($url === '') {
    // Nothing to fetch for this id; fall through to the redirect so the run continues.
    echo "未找到 id=".$id." 对应的 url,已跳过";
    echo '<br/>';
} else {
    $arr = scrape_product($url);

    // Append the product as one CSV row.
    $handle = fopen('Neurotrophins.csv', 'a');
    if ($handle === false) {
        die("无法打开 Neurotrophins.csv !!!");
    }
    fputcsv($handle, $arr);
    fclose($handle);
}
?>

<script>
// Reload this page for the next row id as soon as the current one is done.
function JumpUrl(){
   location.href='?id=<?php echo ($id+1);?>';
}
setTimeout(JumpUrl,0);
</script>

说明:此次采集对 phpQuery 方法采集数据做了局部的优化,使我对此方法有了更深的认识。好方法是成功的一半。

同时也有部分不足:毕竟该采集方法是针对页面源码进行处理的,采集代码是根据页面的排版决定的,所以并不通用。

这在以后的学习过程中,还要继续优化和完善。学无止境,加油!

 

分类:

技术点:

相关文章:

  • 2022-12-23
  • 2021-05-31
  • 2022-01-07
  • 2022-01-02
  • 2022-01-16
  • 2021-05-06
  • 2021-12-29
  • 2021-12-31
猜你喜欢
  • 2022-01-07
  • 2021-05-23
  • 2021-07-03
  • 2021-10-18
  • 2022-01-07
  • 2021-04-15
  • 2021-09-22
相关资源
相似解决方案