Scraping WeChat official accounts (via Sogou's WeChat search)
#Weichat
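# Pipeline of this spider:
#   parse              -> search weixin.sogou.com for the account 宝鸡招商局
#   detail_parse       -> follow the first account hit to its profile page
#   baoji_parse        -> evaluate the inline JS holding msgList, then request
#                         the newest article
#   baoji_detail_parse -> extract title, body, tags and image URLs and yield
#                         a NewsProjectItem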
import re
import time
import urllib.parse

import bs4
import js2py
import scrapy
from lxml import etree

from news_project.items import NewsProjectItem
from news_project.middlewares import Deal_Content
class Weichat(scrapy.Spider):
    name = 'Weichat'
    base = 'https://mp.weixin.qq.com'
    # Best not to include a scheme such as http:// in allowed_domains;
    # mp.weixin.qq.com is listed as well so the article requests are not
    # dropped as off-site.
    allowed_domains = ['weixin.sogou.com', 'mp.weixin.qq.com']
    start_urls = ['http://weixin.sogou.com']

    # Search Sogou's WeChat portal (type=1 = account search) for the target
    # account.
    def parse(self, response):
        url_1 = 'https://weixin.sogou.com/weixin?type=1&query={}&ie=utf8&s_from=input&_sug_=y&_sug_type_='.format('宝鸡招商局')
        yield scrapy.Request(url=url_1, callback=self.detail_parse, dont_filter=True)
    # Follow the first account result on the Sogou results page.
    def detail_parse(self, response):
        baoji_url = response.xpath("//a[@uigs='account_name_0']/@href").extract_first()
        print('baoji_url', baoji_url)
        if baoji_url is None:
            # No account hit -- Sogou probably answered with a verification page.
            return
        yield scrapy.Request(url=baoji_url, callback=self.baoji_parse, dont_filter=True)
    # The profile page builds its article list from an inline JS variable
    # msgList; wrap that script in a function and evaluate it with js2py to
    # recover the list. (A regex/JSON alternative is sketched after this
    # method.)
    def baoji_parse(self, response):
        selector = etree.HTML(response.text)
        script = selector.xpath('.//script[not(@nonce) and @type="text/javascript"]/text()')
        # The second matching script block is the one that assigns msgList.
        script = script[1]
        script = script.replace('seajs.use("sougou/profile.js");', '')
        # Wrap the assignment in a function so js2py can return msgList.
        script = 'function getList(){' + script + 'return msgList \n}'
        # Strip leftover HTML entities (&amp;) from the embedded URLs.
        script = script.replace('amp;', '')
        getList = js2py.eval_js(script)
        # Convert the JsObjectWrapper into a plain Python dict.
        js = getList().to_dict()
        lis = js.get('list')
        firstLinks = []
        otherStyleTimes = []
        for li in lis:
            # The publication time is a Unix timestamp; format it so the
            # database can store it.
            datimes = li['comm_msg_info']['datetime']
            timeArray = time.localtime(datimes)
            otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
            # The relative article URL is joined with self.base to build the
            # real URL.
            try:
                content_url = li.get('app_msg_ext_info').get('content_url')
                print(content_url)
                firstLink = self.base + content_url
            except AttributeError:
                # No app_msg_ext_info on this entry -- usually a sign that
                # Sogou served a verification page instead of the list.
                firstLink = None
                print('CAPTCHA!')
            firstLinks.append(firstLink)
            otherStyleTimes.append(otherStyleTime)
        print('firstLinks, otherStyleTimes', firstLinks, otherStyleTimes)
        # Only the newest article (index 0) is crawled.
        if firstLinks and firstLinks[0]:
            yield scrapy.Request(url=firstLinks[0], callback=self.baoji_detail_parse,
                                 meta={'time': otherStyleTimes[0]}, dont_filter=True)
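    # Alternative sketch (an assumption, not used by the crawl above): pull
    # msgList out of the raw page with a regex and parse it as JSON instead of
    # evaluating the script with js2py. Assumes the profile page assigns
    # `var msgList = {...};` on a single line, as the wrapped script above
    # suggests.
    @staticmethod
    def extract_msglist(page_text):
        import json
        m = re.search(r"var\s+msgList\s*=\s*(\{.*\})\s*;", page_text)
        if m is None:
            return None  # layout changed, or a verification page was served
        # Undo the same HTML entity escaping the js2py path has to strip.
        raw = m.group(1).replace('&amp;', '&')
        return json.loads(raw)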
    # Parse one article page: title, body text, tags and image URLs.
    def baoji_detail_parse(self, response):
        item = NewsProjectItem()
        content = ''
        meta = response.meta
        print("response.url", response.url)
        detailPage = bs4.BeautifulSoup(response.text, "html.parser")
        # Extract the article title and strip noise characters.
        title = detailPage.title.text.replace('\n', '').replace('\r', '').replace(' ', '').replace('!', '').replace('|', '')
        print('title', title)
        # First pass over the body: editor sections, skipping the last three.
        # Printed for debugging only -- the XPath extraction below is what
        # actually fills the item.
        sections = detailPage.findAll('section', class_='_editor')
        for section in sections[:-3]:
            content = content + section.text.replace('\n', '').replace('\r', '').replace(' ', '')
        print("meta['time']", meta['time'])
        print("content", content)
        item['title_url'] = response.url
        # Body of the detail page; named js_content so it does not shadow the
        # lxml.etree module imported above.
        js_content = response.xpath('//div[@id="js_content"]')
        tagContet = ''.join(js_content.extract())
        content = ''.join(js_content.xpath('.//text()').extract())
        img_urls = js_content.xpath('.//img/@src').extract()
        img_urls_dict = {}
        for url in img_urls:
            if "http://网站" not in url:  # "网站" ("site") is a placeholder domain
                url1 = urllib.parse.urljoin(response.url, url)  # resolve relative image URLs
                img_urls_dict[url] = url1
        print("img_urls_dict", img_urls_dict)
        item['content'], item['tags'] = Deal_Content.handleText(content, tagContet, img_urls_dict, title)
        print("item['tags']", item['tags'])
        # Drop everything after the first "font-size: 18px;" marker, if present.
        tail = re.findall("font-size: 18px;(.*)", item['content'])
        if tail:
            item['content'] = item['content'].replace(tail[0], '')
        item['title'] = title
        item['time'] = meta['time']
        id, pid = Deal_Content.sql_read(response.url)
        item['id'] = id
        item['pid'] = pid
        item['type_cn'] = "省市级"
        # news: which site/homepage the article came from.
        item['news'] = '宝鸡招商局'
        # type_no is the category id.
        item['type_no'] = 18
        yield item
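# Minimal run sketch. From inside the news_project Scrapy project the spider
# is normally launched with `scrapy crawl Weichat`; the block below does the
# same programmatically (assumes news_project/settings.py is importable so
# the Deal_Content middleware and pipelines stay active).
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl(Weichat)
    process.start()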