这里有两个爬虫的实例,是刚开始学 Python 时用的:一个是爬取京东茅台酒评论的,另一个是爬取新浪网国内新闻的。两个都来自网上的教程,代码略有不同,供参考学习。
都可以在 Anaconda 环境里运行。
# -*- coding: utf-8 -*-
"""Crawl Sina domestic news.

Fetches the rolling list of domestic-news articles page by page, scrapes
each article's title, source, timestamp, body, editor and comment count,
and saves everything to ``news.xlsx`` via pandas.
"""
import json
import re
from datetime import datetime

import pandas
import requests
from bs4 import BeautifulSoup

news_total = []
# JSONP endpoint returning comment statistics for one article; {} is the news id.
commentURL = 'http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-{}&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20'
# JSONP list endpoint for the domestic-news roll; {} is the 1-based page number.
url = 'http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&format=json&page={}&callback=newsloadercallback&_=1509373193047'


def _strip_jsonp(text, prefix, suffix):
    """Return the JSON payload of *text* with an exact *prefix*/*suffix* removed.

    The original code used ``str.lstrip``/``str.strip`` for this, but those
    treat their argument as a character *set*, not a literal prefix/suffix,
    and can silently eat characters belonging to the payload itself.
    """
    text = text.strip()
    if prefix and text.startswith(prefix):
        text = text[len(prefix):]
    if suffix and text.endswith(suffix):
        text = text[:-len(suffix)]
    return text


def parseListLinks(url):
    """Fetch one JSONP list page and return a list of news-detail dicts."""
    newsdetails = []
    res = requests.get(url)
    payload = _strip_jsonp(res.text, 'newsloadercallback(', ');')
    jd = json.loads(payload)
    for ent in jd['result']['data']:
        newsdetails.append(getNewsDetail(ent['url']))
    return newsdetails


def getNewsDetail(newsurl):
    """Scrape one article page and return its fields as a dict.

    Keys: title, newssource, dt (formatted timestamp), article (body text),
    editor, comments (comment count).
    """
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    result['title'] = soup.select('#artibodyTitle')[0].text
    result['newssource'] = soup.select('.time-source span a')[0].text
    timesource = soup.select('.time-source')[0].contents[0].strip()
    dt1 = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')
    result['dt'] = dt1.strftime('%Y-%m-%d-%H:%M')
    # The last <p> of the body is the editor line, hence the [:-1] slice.
    result['article'] = ' '.join(
        p.text.strip() for p in soup.select('#artibody p')[:-1])
    # Guard: some articles have no editor element; avoid an IndexError.
    editors = soup.select('.article-editor')
    if editors:
        editor = editors[0].text.strip()
        # Remove the literal label prefix (str.strip would treat it as a char set).
        if editor.startswith('责任编辑:'):
            editor = editor[len('责任编辑:'):]
        result['editor'] = editor
    else:
        result['editor'] = ''
    result['comments'] = getCommentCounts(newsurl)
    print('获得一条新闻')
    return result


def getCommentCounts(newsurl):
    """Extract the news id from *newsurl* and return its total comment count.

    Returns 0 when the URL does not contain a ``doc-i<id>.shtml`` id.
    """
    m = re.search(r'doc-i(.+).shtml', newsurl)
    if m is None:
        return 0
    newsid = m.group(1)
    comments = requests.get(commentURL.format(newsid))
    jd = json.loads(_strip_jsonp(comments.text, 'var data=', ''))
    return jd['result']['count']['total']


for i in range(1, 8):
    print('正在爬取第' + str(i) + '页......')
    newsurl = url.format(i)
    newsary = parseListLinks(newsurl)
    news_total.extend(newsary)
print('抓取结束')

df = pandas.DataFrame(news_total)
df.to_excel('news.xlsx')
# -*- coding: utf-8 -*-
"""Crawl JD product reviews (Moutai, productId 1475512465).

Fetches up to 10 pages of the comment JSON endpoint and writes one review
per row into ``result.xls`` with xlwt.
"""
import json
import re
import time

import requests
import xlwt

# Cell style with a CJK font so Chinese text renders correctly.
# BUG FIX: the original created this style but never passed it to
# ws.write(), so it had no effect; it is now applied to every data cell.
style = xlwt.XFStyle()
font = xlwt.Font()
font.name = 'SimSun'
style.font = font

# Workbook and sheet that receive the scraped reviews.
w = xlwt.Workbook(encoding='utf-8')
ws = w.add_sheet('sheet 1', cell_overwrite_ok=True)

# Next sheet row to write; row 0 holds the header.
row = 1

# Column order shared by the header row and every data row
# (was previously duplicated across 22 ws.write calls).
COLUMNS = (
    'content', 'userClientShow', 'creationTime', 'userLevelName',
    'productColor', 'userLevelId', 'score', 'referenceName',
    'referenceTime', 'isMobile', 'nickname',
)
for col, name in enumerate(COLUMNS):
    ws.write(0, col, name)


def write_json_to_xls(dat):
    """Append every comment in one page of JD's JSON response to the sheet.

    BUG FIX: missing optional fields (e.g. ``userClientShow``) used to raise
    KeyError, which the caller's broad ``except`` turned into a silently
    dropped page; they are now written as ''.
    """
    global row
    for comment in dat['comments']:
        for col, key in enumerate(COLUMNS):
            ws.write(row, col, comment.get(key, ''), style)
        row += 1


# Fetch 10 pages of 100 reviews each; pause between requests to be polite.
for i in range(1, 10 + 1):
    url = 'https://club.jd.com/comment/productPageComments.action?productId=1475512465&score=0&sortType=5&page=%d&pageSize=100&isShadowSku=0&fold=' % i
    try:
        json_req = requests.get(url)
        dat = json_req.json()
        write_json_to_xls(dat)
        print(u'写入一页数据')
    except Exception as e:
        print(u'获取数据失败数据', e)
    time.sleep(0.5)

# Persist everything to disk.
w.save('result.xls')