前言:
需要在百度AI平台注册登录并创建项目。
爬虫代码
import scrapy
from BaiDuAi.items import BaiduaiItem


class AiSpider(scrapy.Spider):
    """Fetch one People's Daily article and yield its title and body text."""

    name = 'ai'
    # allowed_domains = ['www.xxx.com']
    # People's Daily (people.com.cn) article URL
    start_urls = ['http://politics.people.com.cn/n1/2018/1217/c1001-30470023.html']

    def parse(self, response):
        """Extract the headline and the full article text from the page.

        Yields a single BaiduaiItem with 'title' and 'content' fields.
        """
        headline = response.xpath('/html/body/div[4]/h1/text()').extract_first()
        # The article body lives in the #rwb_zw container; grab every text node.
        paragraphs = response.xpath('//*[@id="rwb_zw"]//text()').extract()
        body_text = ''.join(paragraphs).strip('\n \t')

        item = BaiduaiItem()
        item['title'] = headline
        item['content'] = body_text
        yield item
管道代码
from aip import AipNlp

# Your APPID / API key / secret key.
# NOTE(review): credentials are hard-coded in source — move them to a
# settings file or environment variables before sharing/committing.
APP_ID = '15198150'
API_KEY = 'jaObSr6rmSmqsjWfKGGpmwxB'
SECRET_KEY = '808Eiz4FPkfMwS2ajClXYhKrcFMN1YUN'

client = AipNlp(APP_ID, API_KEY, SECRET_KEY)


class BaiduaiPipeline(object):
    """Annotate each scraped article with Baidu NLP keywords and a topic
    category, then write title/content/keywords/category to xinwen.html."""

    def process_item(self, item, spider):
        # Baidu NLP rejects \xa0 (non-breaking space), so scrub it first.
        title = item['title'].replace('\xa0', '')
        content = item['content'].replace('\xa0', '')

        # BUG FIX: the original accumulated tags in a class-level list
        # (`self.keys`), so every item after the first was written with
        # stale keywords from previous items. Use a fresh local list.
        keywords = [entry['tag'] for entry in client.keyword(title, content)['items']]
        keys = '/'.join(keywords)

        # Top-level (lv1) topic category for the article.
        topic_dic = client.topic(title, content)
        news_type = topic_dic['item']['lv1_tag_list'][0]['tag']

        # 'w' mode overwrites the file on every item — acceptable for this
        # single-article spider; use append or per-item filenames for
        # multi-item crawls.
        with open('./xinwen.html', 'w', encoding='utf-8') as fp:
            fp.write(title + '\n\n' + content + '\n\n' + keys + '\n\n' + news_type)
        return item