因为最近需要一批数据来做机器学习,所以用火车头采集器来抓数据,数据伪原创用的小发猫的API。以下是PHP实现代码:
<?php
// Plugin entry script: rewrites the article title through a remote rewrite API
// when the host invokes the script for the "Save" page type, then echoes the
// (possibly modified) label array back in PHP-serialized form.
// NOTE(review): $LabelArray is not defined anywhere in this file — presumably it
// is injected by the collector before the script runs; confirm against the host.

set_time_limit(270);
error_reporting(E_ERROR | E_WARNING | E_PARSE);

// Marker strings. TITLE_SEPAR2 is wrapped in parentheses by compose_separator()
// and placed between title and body so the rewritten text can be split again.
define('TITLE_SEPAR', 'xxx**xxx');
define('TITLE_SEPAR2', '262661');

// Rewrite API endpoint. The "key" parameter is empty here — presumably the API
// key has to be appended; verify before use.
$url = 'http://api-6.xiaofamao.com/api.php?json=0&v=1&key=';

// Name of the label that carries the article body.
$content_tag_name = '内容';

// Gallery wrapper markup. Not applied at the moment because the body is not
// written back (see the note inside the "Save" branch).
$headdd = '<figure class="wp-block-gallery columns-3 is-cropped"><ul class="blocks-gallery-grid">';
$taill = '</figure>';

switch ($LabelArray['PageType']) {
    case 'List':    // list page: only the HTML can be processed
        break;

    case 'Pages':   // multi-page: only the HTML can be processed
        break;

    case 'Content': // default page: only the HTML can be processed
        break;

    case 'Save':    // label values can only be modified while saving
        try {
            // Send "title + separator + body" to the rewrite API in one request
            // and split the answer back into its two parts.
            $title = $LabelArray['标题'];
            $content = $LabelArray[$content_tag_name];
            $article_src = compose_article($title, $content);
            $article_new = get_wyc_article($article_src);

            // Guard both indexes: when the separator did not survive the round
            // trip (or the request failed) the array has a single element.
            $title_wyc = isset($article_new[0]) ? trim($article_new[0]) : '';
            $content_wyc = isset($article_new[1]) ? trim($article_new[1]) : '';

            $content_wyc = fix_newline($content_wyc);

            // Drop one literal leading "</p>". ltrim() with '</p>' would treat
            // the argument as a character set and also eat a leading "<p>".
            if (strpos($content_wyc, '</p>') === 0) {
                $content_wyc = substr($content_wyc, strlen('</p>'));
            }

            // NOTE(review): the rewritten body is computed but intentionally not
            // written back; the original assignment
            //   $LabelArray[$content_tag_name] = $headdd . $content_wyc . $taill;
            // was disabled by the author. Only the title is replaced.
            // NOTE(review): an earlier revision replaced '[' and ']' with '【' and
            // '】' on an undefined $new_title and discarded the result; that dead
            // statement was removed. Apply the replacement to $title_wyc if
            // bracket normalisation of the title is wanted.
            $LabelArray['标题'] = trim(strip_tags($title_wyc));
        } catch (Exception $e) {
            // Surface the failure inside the saved data so it is visible to the operator.
            $LabelArray['标题'] .= $e->getMessage();
            $LabelArray[$content_tag_name] .= $e->getMessage();
        }
        break;

    default:
        break;
}

echo serialize($LabelArray);
/**
 * Joins a title and a body into one text, with the separator produced by
 * compose_separator() in between.
 *
 * @param string $title   Article title.
 * @param string $content Article body.
 * @return string Title, separator and body concatenated in that order.
 */
function compose_article($title, $content)
{
    return $title . compose_separator() . $content;
}
/**
 * Builds the marker line placed between title and body: the TITLE_SEPAR2
 * constant wrapped in parentheses, on a line of its own.
 *
 * @return string PHP_EOL, "(" . TITLE_SEPAR2 . ")", PHP_EOL.
 */
function compose_separator()
{
    // Quote delimiters restored: the backslash-escaped quotes were a parse error.
    return PHP_EOL . '(' . TITLE_SEPAR2 . ')' . PHP_EOL;
}
/**
 * Hook for repairing a separator damaged by the rewrite API.
 * Currently a no-op: the text is returned exactly as received.
 *
 * @param string $article Text returned by the rewrite API.
 * @return string The same text, unchanged.
 */
function fix_separator($article)
{
    $unchanged = $article;

    return $unchanged;
}
/**
 * Posts a text to the rewrite API (global $url) as the "wenzhang" field and
 * splits the response at the separator marker.
 *
 * @param string $str Text to rewrite, normally the output of compose_article().
 * @return array Trimmed response parts: index 0 is the text before the marker,
 *               index 1 (when the marker is present) everything after it.
 *               When the marker is missing the array has a single element.
 */
function get_wyc_article($str)
{
    global $url;

    // The marker is matched without its surrounding line breaks, since the
    // response may not preserve them.
    $separator = str_replace(PHP_EOL, '', compose_separator());

    $wyc = curl_request($url, array('wenzhang' => $str));
    $wyc = fix_separator($wyc);

    // Limit to two parts so a marker occurring again inside the body does not
    // cut the body short for callers that only read indexes 0 and 1.
    $wyc = explode($separator, $wyc, 2);

    if (isset($wyc[0])) {
        $wyc[0] = trim($wyc[0]);
    }
    if (isset($wyc[1])) {
        $wyc[1] = trim($wyc[1]);
    }

    return $wyc;
}
/**
 * Rewrites a short title: sends it three times, separated by blank lines, and
 * returns the first line of the rewritten response.
 *
 * @param string $str Title to rewrite.
 * @return string First line of the rewritten text.
 */
function get_wyc_title($str)
{
    $gap = PHP_EOL . PHP_EOL . PHP_EOL;

    // get_wyc_article() returns an array; the text without a separator marker
    // ends up in element 0. The previous code passed the whole array to the
    // string helpers below.
    $parts = get_wyc_article($str . $gap . $str . $gap . $str);
    $rewritten = isset($parts[0]) ? $parts[0] : '';

    $lines = explode(PHP_EOL, fix_newline($rewritten));

    return $lines[0];
}
/**
 * Posts a title and text to the keyword endpoint and returns the raw response.
 *
 * The request carries the fields "title", "len" (fixed at 100) and "text".
 * NOTE(review): the "appid" query parameter is empty in the URL — presumably an
 * application id has to be appended; confirm with the service.
 *
 * @param string $title    Article title.
 * @param string $contents Article body.
 * @return mixed Whatever curl_request() returns for the call.
 */
function get_keywords($title, $contents)
{
    // Quote delimiters restored: the backslash-escaped quotes were a parse error.
    $url_kw = 'http://api-2.78tp.com/nlp/kws.php?appid=';

    return curl_request($url_kw, array(
        'title' => $title,
        'len'   => 100,
        'text'  => $contents,
    ));
}
/**
 * Removes double-quoted alt="..." attributes from markup.
 *
 * @param string $contents HTML fragment.
 * @return string The fragment with every alt="..." attribute deleted; the
 *                whitespace around the attribute is left in place.
 */
function remove_alt($contents)
{
    // [^"]* stops at the attribute's own closing quote. The former greedy (.*)
    // ran to the last quote on the line and deleted neighbouring attributes.
    return preg_replace('/alt="[^"]*"/', '', $contents);
}
/**
 * Removes sentence punctuation from a title.
 *
 * Both the full-width (CJK) and the ASCII form of each mark are removed.
 *
 * @param string $contents Title text.
 * @return string The title without the listed punctuation marks.
 */
function fix_title($contents)
{
    // The first row is the full-width set. It had been collapsed to ASCII
    // duplicates of the second row, so full-width marks were no longer removed.
    $punctuation_symbol = array(
        '。', '？', '，', '：', '；', '、', '！',
        '.', '?', ',', ':', ';', '!',
    );

    return str_replace($punctuation_symbol, '', $contents);
}
/**
 * Converts HTML line-break tags to PHP_EOL.
 *
 * Matches <br>, <br/> and <br /> in any letter case and with any amount of
 * whitespace before the optional slash.
 *
 * @param string $contents HTML fragment.
 * @return string The fragment with every line-break tag replaced by PHP_EOL.
 */
function br2newline($contents)
{
    // One case-insensitive pattern replaces the six hard-coded spellings and
    // also covers variants such as <Br> that the list missed.
    return preg_replace('/<br\s*\/?>/i', PHP_EOL, $contents);
}
/**
 * Converts PHP_EOL line breaks to <br> tags, then removes a <br> that directly
 * follows an opening <p>.
 *
 * @param string $contnets Text or HTML fragment (parameter name kept as-is for
 *                         compatibility with existing callers).
 * @return string The converted fragment.
 */
function newline2br($contnets)
{
    // Quote delimiters restored: the backslash-escaped quotes were a parse error.
    $contnets = str_replace(PHP_EOL, '<br>', $contnets);
    $contnets = str_replace('<p><br>', '<p>', $contnets);

    return $contnets;
}
/**
 * Normalises line breaks in a text by delegating to fix_newline().
 *
 * @param string $contents Text to normalise.
 * @return string Result of fix_newline() for the given text.
 */
function delete_newline($contents)
{
    return fix_newline($contents);
}
/**
 * Rewrites every line break ("\r\n", lone "\r" or lone "\n") as PHP_EOL.
 * Blank lines are preserved.
 *
 * @param string $contents Text with mixed line endings.
 * @return string The text with platform line endings.
 */
function reset_newline_win($contents)
{
    // "\r\n" is listed first so a Windows line ending counts as one break.
    $lines = preg_split('/\r\n|\r|\n/', $contents);

    return implode(PHP_EOL, $lines);
}
/**
 * Normalises line breaks and collapses runs of them into one.
 *
 * Every "\r" is first turned into "\n", consecutive "\n" characters are reduced
 * to a single one, and each remaining "\n" becomes PHP_EOL.
 *
 * @param string $data Text to normalise.
 * @return string The text with single, platform-specific line breaks.
 */
function fix_newline($data)
{
    $text = str_replace("\r", "\n", $data);

    // A run of two or more line feeds shrinks to exactly one.
    $text = preg_replace('/\n{2,}/', "\n", $text);

    return str_replace("\n", PHP_EOL, $text);
}
/**
 * Strips HTML attributes from a fragment using cleanHtml, keeping only a small
 * whitelist: "src" on every element, "src" and "alt" on <img>, and "src" and
 * "frameborder" on <iframe>.
 *
 * @param string $contents HTML fragment.
 * @return string The fragment with all other attributes removed.
 */
function clean_contents($contents)
{
    // Quote delimiters restored: the backslash-escaped quotes were a parse error.
    $sa = new cleanHtml;
    $sa->allow = array('src');
    $sa->exceptions = array(
        'img'    => array('src', 'alt'),
        'iframe' => array('src', 'frameborder'),
    );

    return $sa->strip($contents);
}
/**
 * Replaces only the first occurrence of a substring.
 *
 * @param string $search  Substring to look for.
 * @param string $replace Text that takes its place.
 * @param string $subject Text to search in.
 * @return string $subject with the first match replaced, or $subject unchanged
 *                when $search does not occur.
 */
function xfm_strong_str_replace_once($search, $replace, $subject)
{
    $offset = strpos($subject, $search);
    if ($offset === false) {
        return $subject;
    }

    // Splice the replacement over the matched bytes.
    return substr_replace($subject, $replace, $offset, strlen($search));
}
/**
 * Performs an HTTP request with cURL.
 *
 * @param string       $url          URL to request.
 * @param array|string $post         POST data; when empty a GET request is made.
 *                                   Arrays are URL-encoded, strings are sent as given.
 * @param string       $cookie       Cookie string to send; skipped when empty.
 * @param int          $returnCookie When truthy, response headers are requested too
 *                                   and the first Set-Cookie value is extracted.
 * @return string|array The response body; or, when $returnCookie is truthy, an
 *                      array with the keys "cookie" and "content"; or the cURL
 *                      error message when the transfer failed.
 */
function curl_request($url, $post = '', $cookie = '', $returnCookie = 0)
{
    if (!extension_loaded('curl')) {
        // Try to load the extension at runtime when its DLL sits next to the script.
        file_exists('./ext/php_curl.dll') && dl('php_curl.dll');
    }

    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)');
    // Redirect following is only enabled when open_basedir/safe_mode are off.
    if (ini_get('open_basedir') == '' && strtolower(ini_get('safe_mode')) != 'on') {
        curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1);
    }
    curl_setopt($curl, CURLOPT_AUTOREFERER, 1);
    curl_setopt($curl, CURLOPT_REFERER, "http://XXX");

    if ($post) {
        curl_setopt($curl, CURLOPT_POST, 1);
        // http_build_query() only accepts arrays/objects; pass a ready-made
        // string body through untouched.
        curl_setopt($curl, CURLOPT_POSTFIELDS, is_string($post) ? $post : http_build_query($post));
    }
    if ($cookie) {
        curl_setopt($curl, CURLOPT_COOKIE, $cookie);
    }

    curl_setopt($curl, CURLOPT_HEADER, $returnCookie);
    curl_setopt($curl, CURLOPT_TIMEOUT, 150);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);

    $data = curl_exec($curl);
    if (curl_errno($curl)) {
        // Read the message before closing; the handle used to leak on this path.
        $error = curl_error($curl);
        curl_close($curl);
        return $error;
    }
    curl_close($curl);

    if (!$returnCookie) {
        return $data;
    }

    // Headers and body are separated by the first blank line.
    $parts = explode("\r\n\r\n", $data, 2);
    $header = $parts[0];
    $body = isset($parts[1]) ? $parts[1] : '';

    preg_match_all("/Set\-Cookie:([^;]*);/", $header, $matches);

    $info = array();
    // substr(…, 1) drops the space after "Set-Cookie:". Empty when no cookie was set.
    $info['cookie'] = isset($matches[1][0]) ? substr($matches[1][0], 1) : '';
    $info['content'] = $body;

    return $info;
}
//echo $tag;
/**
 * Counts the characters (not bytes) of a UTF-8 string.
 *
 * @param string|null $string Text to measure; null counts as an empty string.
 * @return int Number of UTF-8 characters, or 0 when the input is empty or is
 *             not valid UTF-8.
 */
function utf8_strlen($string = null)
{
    // With the "u" modifier "." matches one whole character; "s" lets it match
    // line breaks as well. The return value is the number of matches.
    $count = preg_match_all('/./us', (string) $string, $match);

    // preg_match_all() does not return an integer when the subject is invalid
    // UTF-8; report 0 instead of counting a missing result set.
    return is_int($count) ? $count : 0;
}
/**
 * Escapes a literal string for use inside a "/"-delimited PCRE pattern.
 *
 * @param string $str Literal text.
 * @return string The text with regular-expression metacharacters and the "/"
 *                delimiter backslash-escaped.
 */
function reg_escape( $str )
{
    // The hand-written table mapped "$" to "\$", which inside double quotes is
    // just "$" again, so dollar signs were never escaped. preg_quote() covers
    // the full metacharacter set.
    return preg_quote( $str, '/' );
}
/**
 * Strip attribute Class
 * Remove attributes from XML elements
 * @author David (semlabs.co.uk)
 * @version 0.2.1
 *
 * Usage: set $allow (attribute names kept on every element), $exceptions
 * (element name => attribute names kept on that element) and $ignore (element
 * names left untouched), then call strip().
 */
class cleanHtml
{
    /** @var string Markup currently being processed. */
    public $str = '';
    /** @var array Attribute names kept on every element. */
    public $allow = array();
    /** @var array Map of element name => attribute names kept on that element. */
    public $exceptions = array();
    /** @var array Element names whose attributes are left untouched. */
    public $ignore = array();

    /**
     * Removes every attribute that is not whitelisted.
     *
     * @param mixed $str Markup to clean.
     * @return mixed The cleaned markup; non-string or empty input is returned as given.
     */
    public function strip( $str )
    {
        $this->str = $str;

        if( is_string( $str ) && strlen( $str ) > 0 )
        {
            $res = $this->findElements();
            // A string result means no element carried attributes.
            if( is_string( $res ) )
                return $res;

            $nodes = $this->findAttributes( $res );
            $this->removeAttributes( $nodes );
        }

        return $this->str;
    }

    /**
     * Collects the opening tags that have something after the element name.
     *
     * @return array|string List of nodes (literal, name, attributes), or the
     *                      unchanged markup when there is nothing to process.
     */
    private function findElements()
    {
        $nodes = array();
        preg_match_all( "/<([^ !\/\>\n]+)([^>]*)>/i", $this->str, $elements );

        foreach( $elements[1] as $el_key => $element_name )
        {
            $attributes = $elements[2][$el_key];
            if( !$attributes )
                continue;

            if( is_array( $this->ignore ) && !in_array( $element_name, $this->ignore ) )
                $nodes[] = array( 'literal' => $elements[0][$el_key], 'name' => $element_name, 'attributes' => $attributes );
        }

        // empty() instead of reading $nodes[0], which does not exist for an empty list.
        if( empty( $nodes ) )
            return $this->str;

        return $nodes;
    }

    /**
     * Parses each node's raw attribute text into a list of attributes.
     *
     * @param array $nodes Nodes from findElements().
     * @return array The nodes with "attributes" replaced by a list of
     *               (literal, name, value) entries, or null when none was found.
     */
    private function findAttributes( $nodes )
    {
        foreach( $nodes as &$node )
        {
            // Start with a fresh list per node; it used to be undefined for
            // tags without any name=value pair (for example "<br />").
            $atts = array();
            preg_match_all( "/([^ =]+)\s*=\s*[\"|\']{0,1}([^\"\']*)[\"|\']{0,1}/i", $node['attributes'], $attributes );

            foreach( $attributes[1] as $att_key => $attribute_name )
            {
                $atts[] = array(
                    'literal' => $attributes[0][$att_key],
                    'name'    => $attribute_name,
                    'value'   => $attributes[2][$att_key],
                );
            }

            $node['attributes'] = $atts ? $atts : null;
        }
        // Break the reference left behind by the by-reference loop.
        unset( $node );

        return $nodes;
    }

    /**
     * Rewrites each collected tag in $this->str with only its permitted attributes.
     *
     * @param array $nodes Nodes from findAttributes().
     * @return void
     */
    private function removeAttributes( $nodes )
    {
        foreach( $nodes as $node )
        {
            $node_name = $node['name'];
            $new_attributes = '';

            if( is_array( $node['attributes'] ) )
            {
                foreach( $node['attributes'] as $attribute )
                {
                    $allowed_everywhere = is_array( $this->allow ) && in_array( $attribute['name'], $this->allow );
                    if( $allowed_everywhere || $this->isException( $node_name, $attribute['name'], $this->exceptions ) )
                        $new_attributes = $this->createAttributes( $new_attributes, $attribute['name'], $attribute['value'] );
                }
            }

            $replacement = ( $new_attributes ) ? "<$node_name $new_attributes>" : "<$node_name>";

            // Plain string replacement: both sides are literals, and a regex
            // replacement would misread "$1" or "\1" inside attribute values as
            // backreferences.
            $this->str = str_replace( $node['literal'], $replacement, $this->str );
        }
    }

    /**
     * Tells whether an attribute is whitelisted for one specific element.
     *
     * @param string $element_name   Element name.
     * @param string $attribute_name Attribute name.
     * @param array  $exceptions     Unused; $this->exceptions is consulted.
     * @return bool True when the attribute is listed for the element.
     */
    private function isException( $element_name, $attribute_name, $exceptions )
    {
        if( is_array( $this->exceptions ) && array_key_exists( $element_name, $this->exceptions ) )
        {
            if( in_array( $attribute_name, $this->exceptions[$element_name] ) )
                return true;
        }

        return false;
    }

    /**
     * Appends name="value" to an attribute string, space-separated.
     *
     * @param string $new_attributes Attribute string built so far.
     * @param string $name           Attribute name.
     * @param string $value          Attribute value.
     * @return string The extended attribute string.
     */
    private function createAttributes( $new_attributes, $name, $value )
    {
        if( $new_attributes )
            $new_attributes .= " ";

        $new_attributes .= "$name=\"$value\"";

        return $new_attributes;
    }
}
?>
我们选择方法1:“保存到软件数据库”,同时,选择模式3“网上发布到网站”的“使用自定义发布方式”,选择3“自定义分类标识”,将任务命名为“房地产”,将收藏任务命名为“保存并更新”。由于我们的教程刚刚开始,我们不会做深入的研究。
返回火车头主界面,在“房地产”任务上点击鼠标右键,选择“开始”即可完成采集。采集到的数据将自动发布到模式3所指向网站的指定栏目(标识=3),同时保存到数据库文件:火车头安装目录/Data/序列号-任务名称/SpiderResult.mdb。
哦,昨天有网友指出了我的一处错误。当时我要写文案、录视频,还要给自己的网站采集信息,连续忙了3个小时,头都晕了好几次,所以教程写得很仓促、很粗糙,完全是凭感觉写的,让大家看得一头雾水。对不起,请大家原谅!现更正如下:
这里,方法1和方法3是并列关系,可以同时选择,也可以只选择其中一个;如果没有发布模块,可以直接采集到本地软件数据库。“本地软件数据库”是微软Access格式的数据库,我们可以直接打开它来浏览和检查数据。
至于模式3,“火车头采集器伪原创”,我将在下面的教程中解释。我希望每个人都能耐心等待。
好了,本教程到此结束!下一课,再见!