wwwhza

这里有两个爬虫实例,是刚开始学 Python 时写的练习:一个爬取京东茅台酒的评论,另一个爬取新浪网的国内新闻。两个都来自网上的教程,代码略有改动,供参考学习。

都可以在 Anaconda 环境里运行。

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
import json
import pandas
# Accumulates one dict per scraped article across all list pages.
news_total = []
# Sina comment-count API; {} is filled with the article's news id.
commentURL = 'http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-{}&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20'
# Paginated national-news list API (JSONP); {} is filled with the page number.
url = 'http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&format=json&page={}&callback=newsloadercallback&_=1509373193047'
def parseListLinks(url):
    """Fetch one page of the news-list API and return a detail dict per article.

    The API answers with JSONP: ``newsloadercallback({...});``.  The original
    code unwrapped it with ``lstrip``/``rstrip``, which strip a *character set*
    rather than a prefix/suffix and only worked by luck; here the wrapper is
    removed with an explicit regex match instead.
    """
    newsdetails = []
    res = requests.get(url, timeout=10)
    payload = res.text.strip()
    m = re.match(r'newsloadercallback\((.*)\);?$', payload, re.S)
    jd = json.loads(m.group(1) if m else payload)
    for ent in jd['result']['data']:
        newsdetails.append(getNewsDetail(ent['url']))
    return newsdetails
        
def getNewsDetail(newsurl):
    """Scrape one Sina article page into a dict.

    Returned keys: title, newssource, dt (formatted timestamp), article
    (body text), editor, comments (total comment count).
    Raises IndexError if the page does not follow the expected layout.
    """
    result = {}
    res = requests.get(newsurl, timeout=10)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    result['title'] = soup.select('#artibodyTitle')[0].text
    result['newssource'] = soup.select('.time-source span a')[0].text
    timesource = soup.select('.time-source')[0].contents[0].strip()
    dt1 = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')
    result['dt'] = dt1.strftime('%Y-%m-%d-%H:%M')
    # The last <p> of the body is the editor line, so it is dropped here.
    result['article'] = ' '.join([p.text.strip() for p in soup.select('#artibody p')[:-1]])
    # BUG FIX: str.strip('责任编辑:') removes *any* of those characters from
    # both ends and can truncate the editor's name; remove the literal
    # prefix instead.  (NOTE(review): the page might use the fullwidth
    # colon '：' — confirm against live markup.)
    editor_text = soup.select('.article-editor')[0].text.strip()
    prefix = '责任编辑:'
    if editor_text.startswith(prefix):
        editor_text = editor_text[len(prefix):]
    result['editor'] = editor_text
    result['comments'] = getCommentCounts(newsurl)
    print('获得一条新闻')
    return result
       
    
def getCommentCounts(newsurl):
    """Return the total comment count for an article URL, or 0 if the URL
    does not contain a recognizable news id.

    The comment API answers with ``var data={...}``.  The original code
    unwrapped it with ``strip('var data=')``, which strips a *character set*
    from both ends and only worked by luck; the prefix is cut off explicitly
    here instead.
    """
    # Escape the dot: the original pattern 'doc-i(.+).shtml' let '.' match
    # any character before 'shtml'.
    m = re.search(r'doc-i(.+)\.shtml', newsurl)
    if m is None:
        # No embedded news id -> nothing to look up (original raised
        # AttributeError here).
        return 0
    newsid = m.group(1)
    comments = requests.get(commentURL.format(newsid), timeout=10)
    text = comments.text.strip()
    jd = json.loads(text.split('=', 1)[1] if '=' in text else text)
    return jd['result']['count']['total']

# Crawl list pages 1-7, collect every article's details, then dump the
# whole batch to an Excel file via pandas.
for i in range(1, 8):
    print('正在爬取第' + str(i) + '页......')
    newsurl = url.format(i)
    newsary = parseListLinks(newsurl)
    news_total.extend(newsary)
print('抓取结束')
df = pandas.DataFrame(news_total)
df.to_excel('news.xlsx')

 

import requests 
import re
import json
import time
import xlwt

#
#
#配置表格
#不需要明白是干啥的
#有下面4行代码就可以往表格写中文了
#
# --- Excel workbook setup (xlwt) ------------------------------------------
# SimSun font style so Chinese text renders correctly in the sheet.
# NOTE(review): `style` is built but never passed to ws.write() below or in
# write_json_to_xls — confirm whether it was meant to be applied.
style = xlwt.XFStyle()
font = xlwt.Font()
font.name = 'SimSun'
style.font = font

# One workbook, one sheet; cell_overwrite_ok allows rewriting header cells.
w = xlwt.Workbook(encoding='utf-8')
ws = w.add_sheet('sheet 1', cell_overwrite_ok=True)

# Next worksheet row to fill; row 0 holds the header written below.
row = 1

# Header row: one column per comment field extracted from the JD JSON.
for _col, _title in enumerate(['content', 'userClientShow', 'creationTime',
                               'userLevelName', 'productColor', 'userLevelId',
                               'score', 'referenceName', 'referenceTime',
                               'isMobile', 'nickname']):
    ws.write(0, _col, _title)

#
#接受一个json对象
#将内容写进表格
#一次一页评论
#
def write_json_to_xls(dat):
    """Append every comment in one page of JD's comment JSON to the sheet.

    Reads and advances the module-level `row` cursor; writes one worksheet
    row per comment.  Column order must match the header row written at
    module level.
    """
    global row
    fields = ['content', 'userClientShow', 'creationTime', 'userLevelName',
              'productColor', 'userLevelId', 'score', 'referenceName',
              'referenceTime', 'isMobile', 'nickname']
    for comment in dat['comments']:
        for col, key in enumerate(fields):
            # .get() keeps one missing field from raising KeyError and
            # thereby (via the caller's broad except) discarding the whole
            # page of comments.
            ws.write(row, col, comment.get(key, ''))
        row += 1

#
#
# 循环获取数据
#
#
# Fetch 10 pages of JD product comments (100 per page) and write each page
# into the worksheet as it arrives.
for i in range(1, 10 + 1):
    url = 'https://club.jd.com/comment/productPageComments.action?productId=1475512465&score=0&sortType=5&page=%d&pageSize=100&isShadowSku=0&fold=' % i
    try:
        json_req = requests.get(url, timeout=10)
        dat = json_req.json()
        write_json_to_xls(dat)
        print(u'写入一页数据')
    except Exception as e:
        # Best-effort scrape: report the failure and continue with the
        # next page rather than aborting the run.
        print(u'获取数据失败数据', e)
    time.sleep(0.5)  # throttle requests to be polite to the server


# Persist everything collected so far.
w.save('result.xls')

 

分类:

技术点:

相关文章: