
Douban URL: https://movie.douban.com/top250

Import the required packages:

import requests
import random
from lxml import etree
import csv

Step 1: analyze the URL pattern

https://movie.douban.com/top250
https://movie.douban.com/top250?start=225&filter=

Each page differs only in the start offset, so start and filter can be passed as params to build the URL for every page:

url = "https://movie.douban.com/top250"
for i in range(0, 250, 25):
    params = {
        "start": i,
        "filter": ""
    }
    html = get_url(url, params)
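As a quick sanity check (illustration only, not part of the crawler), requests can show the URL it builds from params without actually sending a request:

req = requests.Request("GET", url, params={"start": 25, "filter": ""})
print(req.prepare().url)  # https://movie.douban.com/top250?start=25&filter=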

Step 2: build the request function

def get_url(url, params):
    # Pool of User-Agent strings to rotate between requests
    User_Agent = [
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11"
    ]
    # Pool of proxies to rotate, one dict per proxy (replace with working proxies of your own)
    ips = [
        {"http": "123.55.98.4:9999"},
        {"http": "182.34.100.21:9999"}
    ]
    headers = {"User-Agent": random.choice(User_Agent)}
    response = requests.get(url, headers=headers, proxies=random.choice(ips), params=params)
    html = response.text
    return html
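A minimal usage check (illustration only; it assumes the proxies above are reachable, otherwise drop the proxies argument from requests.get):

html = get_url("https://movie.douban.com/top250", {"start": 0, "filter": ""})
print(len(html))  # a non-trivial length suggests the page body came back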
Step 3: parse the data

def parse_html(html):
    tree = etree.HTML(html)
    # Matches the single <ol class="grid_view"> container; the relative
    # xpaths below then return all 25 entries of the page at once
    lis = tree.xpath("//ol[@class='grid_view']")
    for li in lis:
        names = li.xpath(".//div[@class='hd']/a/span[1]/text()")
        actors = li.xpath(".//div[@class='bd']/p/text()[1]")
        types = li.xpath(".//div[@class='bd']/p/text()[2]")
        comments = li.xpath(".//div[@class='star']/span[2]/text()")
        comments_nums = li.xpath(".//div[@class='star']/span[4]/text()")
        quotes = li.xpath(".//p[@class='quote']/span/text()")
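To eyeball a single field while developing (illustration only; assumes html holds a page fetched with get_url), run one xpath against the page and print the first few results:

tree = etree.HTML(html)
print(tree.xpath("//ol[@class='grid_view']//div[@class='hd']/a/span[1]/text()")[:3])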
Step 4: clean the data

The actors and types results come back messy: the strings contain "\n" and extra spaces. del_n() calls strip() on every item and keeps only the non-empty results (removing elements from a list while looping over it skips items, so filtering during the strip pass is safer than calling remove()):

def del_n(lst):
    # Strip whitespace from every item, then keep only the non-empty ones
    a = []
    for i in lst:
        i = i.strip()
        if len(i) > 0:
            a.append(i)
    return a
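For example (the input below is a made-up list shaped like the raw xpath text):

raw = ["\n        1994 / 美国 / 犯罪 剧情\n    ", "\n"]
print(del_n(raw))  # ['1994 / 美国 / 犯罪 剧情']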
Pass actors and types through del_n():

actors = li.xpath(".//div[@class='bd']/p/text()[1]")
actor = del_n(actors)
types = li.xpath(".//div[@class='bd']/p/text()[2]")
type = del_n(types)

Each cleaned types entry looks like "year / country / genre", so split it on "/" into separate lists, which makes writing the CSV rows later straightforward:

year = []
country = []
itype = []
for i in type:
    i_year = i.split("/")[0]
    i_country = i.split("/")[1]
    i_type_clas = i.split("/")[2]
    year.append(i_year)
    country.append(i_country)
    itype.append(i_type_clas)
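As an illustration with a hypothetical cleaned string, split("/") yields the three pieces, each still carrying its surrounding spaces (strip each piece if that matters for your CSV):

info = "1994 / 美国 / 犯罪 剧情"
print(info.split("/"))  # ['1994 ', ' 美国 ', ' 犯罪 剧情']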

Step 5: store the data

In the main block, create a csv writer and write the header row (newline="" keeps the csv module from emitting blank lines on Windows):

if __name__ == "__main__":
    with open("dianyin.csv", 'w', encoding="utf-8", errors='ignore', newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["名称","导演","年份","国家","类型","评价","评价人数","简介"])

Back in parse_html(), zip() bundles the parsed lists into per-movie tuples, which are then written to the csv file one row at a time:

contents = zip(names, actor, year, country, itype, comments, comments_nums, quotes)
for content in contents:
    writer.writerow(content)

Finally, call parse_html() on each fetched page inside the main loop:

parse_html(html)
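An optional check after a run (illustration only): read the file back to confirm the rows were written:

with open("dianyin.csv", encoding="utf-8") as f:
    for row in csv.reader(f):
        print(row)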

The complete code:
import requests
import random
from lxml import etree
import csv

def get_url(url, params):
    # Pool of User-Agent strings to rotate between requests
    User_Agent = [
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11"
    ]
    # Pool of proxies to rotate, one dict per proxy (replace with working proxies of your own)
    ips = [
        {"http": "123.55.98.4:9999"},
        {"http": "182.34.100.21:9999"}
    ]
    headers = {"User-Agent": random.choice(User_Agent)}
    response = requests.get(url, headers=headers, proxies=random.choice(ips), params=params)
    html = response.text
    return html

def del_n(lst):
    # Strip whitespace from every item, then keep only the non-empty ones
    a = []
    for i in lst:
        i = i.strip()
        if len(i) > 0:
            a.append(i)
    return a

def parse_html(html):
    tree = etree.HTML(html)
    # Matches the single <ol class="grid_view"> container; the relative
    # xpaths below then return all 25 entries of the page at once
    lis = tree.xpath("//ol[@class='grid_view']")
    for li in lis:
        names = li.xpath(".//div[@class='hd']/a/span[1]/text()")
        actors = li.xpath(".//div[@class='bd']/p/text()[1]")
        actor = del_n(actors)
        types = li.xpath(".//div[@class='bd']/p/text()[2]")
        type = del_n(types)
        year = []
        country = []
        itype = []
        for i in type:
            i_year = i.split("/")[0]
            i_country = i.split("/")[1]
            i_type_clas = i.split("/")[2]
            year.append(i_year)
            country.append(i_country)
            itype.append(i_type_clas)
        comments = li.xpath(".//div[@class='star']/span[2]/text()")
        comments_nums = li.xpath(".//div[@class='star']/span[4]/text()")
        quotes = li.xpath(".//p[@class='quote']/span/text()")
        contents = zip(names, actor, year, country, itype, comments, comments_nums, quotes)
        # writer is the module-level csv writer created in the __main__ block below
        for content in contents:
            writer.writerow(content)

if __name__ == "__main__":
    with open("dianyin.csv", 'w', encoding="utf-8", errors='ignore', newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["名称","导演","年份","国家","类型","评价","评价人数","简介"])
        url = "https://movie.douban.com/top250"
        for i in range(0, 250, 25):
            params = {
                "start": i,
                "filter": ""
            }
            html = get_url(url, params)
            parse_html(html)
