import json
import requests
from requests.exceptions import RequestException
import re
import time
# Trace marker: printed while the module loads, before get_hot_movie_rank is defined.
print("0")
def get_hot_movie_rank(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when the request raises a
        ``RequestException`` (which includes timeouts).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        # Best-effort fetch: callers treat None as "no page available".
        return None
    if response.status_code == 200:
        return response.text
    return None
# Trace marker: printed while the module loads, after get_hot_movie_rank is defined.
print("1")
# Compiled once at import time. re.S lets ".*?" run across the line breaks
# inside a single <dd>...</dd> entry. Raw strings keep "\d" a regex escape
# rather than an (invalid) string escape.
_HOT_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
    r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    r'.*?realtime">.*?stonefont">(.*?)</span></span>(.*?)</p>'
    r'.*?total-boxoffice">.*?stonefont">(.*?)</span></span>(.*?)</p>.*?</dd>',
    re.S,
)


def parse_hot_movie_rank(html):
    """Yield one dict per ``<dd>`` entry found in *html*.

    Args:
        html: Page markup as a string. ``None`` or an empty string yields
            nothing, so the result of a failed download can be passed in
            directly.

    Yields:
        Dicts with the keys ``'index'``, ``'image'``, ``'title'``,
        ``'actor'``, ``'release time'``, ``'Real-time box office'`` and
        ``'Total box office'``. All values are strings taken verbatim from
        the markup; the box-office values are the ``stonefont`` span text
        followed by the text after it (no entity or font decoding is done).
    """
    if not html:
        return
    for match in _HOT_MOVIE_PATTERN.findall(html):
        (index, image, title, star, release,
         realtime_num, realtime_unit, total_num, total_unit) = match
        yield {
            'index': index,
            'image': image,
            'title': title,
            # Drops the first 3 characters of the stripped text -- presumably
            # the cast label that precedes the names; confirm against the page.
            'actor': star.strip()[3:],
            # Drops the first 5 characters -- presumably the release-date
            # label; confirm against the page.
            'release time': release.strip()[5:],
            'Real-time box office': (realtime_num + realtime_unit).strip(),
            'Total box office': (total_num + total_unit).strip(),
        }
# Trace marker: printed while the module loads, after parse_hot_movie_rank is defined.
print("2")
def write_to_file(content, path='maoyan1.txt'):
    """Append *content* to *path* as one JSON document per line.

    Args:
        content: Any JSON-serialisable object (the scraper passes one dict
            per movie).
        path: File to append to; created if it does not exist. Defaults to
            ``maoyan1.txt`` in the current working directory.

    Non-ASCII characters are written as-is (UTF-8) rather than as escape
    sequences, so the file stays human-readable.
    """
    with open(path, 'a', encoding='utf-8') as f:
        # Terminate each record with a real newline so that records do not
        # run together on a single line.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
# Trace marker: printed while the module loads, after write_to_file is defined.
print("3")
def main():
    """Fetch the board page, then print and save every parsed movie entry.

    Each entry produced by ``parse_hot_movie_rank`` is printed to stdout and
    appended to the output file via ``write_to_file``. If the page cannot be
    downloaded (``get_hot_movie_rank`` returns ``None``) a message is printed
    and nothing is written.
    """
    url = 'https://maoyan.com/board/1'
    html = get_hot_movie_rank(url)
    if html is None:
        # The fetcher signals failure with None; stop here instead of handing
        # None to the regex parser.
        print('Failed to fetch {}'.format(url))
        return
    for item in parse_hot_movie_rank(html):
        print(item)
        write_to_file(item)
# Trace marker: printed while the module loads, after main is defined.
print("4")
if __name__ == '__main__':
    # Run the scraper only when executed as a script, not when imported.
    main()
    # One-second pause after the single main() call.
    time.sleep(1)
# Trace marker: printed once the module has been executed to the end.
print("5")
0
1
2
3
4
{\'index\': \'1\', \'image\': \'https://p0.meituan.net/moviemachine/f7d2ad70eb79d6d9b8a197713db9b8c41711752.jpg@160w_220h_1e_1c\', \'title\': \'复仇者联盟4:终局之战\', \'actor\': \'小罗伯特·唐尼,克里斯·埃文斯,马克·鲁法洛\', \'release time\': \'2019-04-24\', \'Real-time box office\': \'.亿\', \'Total box office\': \'.亿\'}
{\'index\': \'2\', \'image\': \'https://p1.meituan.net/movie/d28b729ffe72353a72d1e7ef8a9b90591544978.jpg@160w_220h_1e_1c\', \'title\': \'何以为家\', \'actor\': \'赞恩·阿尔·拉菲亚,约丹诺斯·希费罗,博鲁瓦蒂夫·特雷杰·班科尔\', \'release time\': \'2019-04-29\', \'Real-time box office\': \'.万\', \'Total box office\': \'.万\'}
{\'index\': \'3\', \'image\': \'https://p0.meituan.net/movie/29cebff7d3ed1cf98fbeb6b01c908e1b9947789.jpg@160w_220h_1e_1c\', \'title\': \'雪暴\', \'actor\': \'张震,廖凡,倪妮\', \'release time\': \'2019-04-30\', \'Real-time box office\': \'.万\', \'Total box office\': \'.万\'}
{\'index\': \'4\', \'image\': \'https://p0.meituan.net/moviemachine/409aca94fa1695a6bdb5206735189c11495127.jpg@160w_220h_1e_1c\', \'title\': \'下一任:前任\', \'actor\': \'郭采洁,郑恺,李东学\', \'release time\': \'2019-05-01\', \'Real-time box office\': \'.万\', \'Total box office\': \'.万\'}
{\'index\': \'5\', \'image\': \'https://p1.meituan.net/movie/c63849c7a9de360a7b192bc322792a111705236.jpg@160w_220h_1e_1c\', \'title\': \'反贪风暴4\', \'actor\': \'古天乐,郑嘉颖,林峯\', \'release time\': \'2019-04-04\', \'Real-time box office\': \'.万\', \'Total box office\': \'.亿\'}
{\'index\': \'6\', \'image\': \'https://p0.meituan.net/moviemachine/90258899534b9cca44f2e9b9a6246504248749.jpg@160w_220h_1e_1c\', \'title\': \'动物出击\', \'actor\': \'景熙童\', \'release time\': \'2019-04-30\', \'Real-time box office\': \'.万\', \'Total box office\': \'.万\'}
{\'index\': \'7\', \'image\': \'https://p0.meituan.net/movie/eda6595dc2c3a5d7cdda5eb4f8d8b1982460902.jpg@160w_220h_1e_1c\', \'title\': \'撞死了一只羊\', \'actor\': \'金巴,更登彭措,索朗旺姆\', \'release time\': \'2019-04-26\', \'Real-time box office\': \'.万\', \'Total box office\': \'.万\'}
{\'index\': \'8\', \'image\': \'https://p0.meituan.net/movie/29caaa1b66c95807a3f4d29b5b03644b1876102.jpg@160w_220h_1e_1c\', \'title\': \'调音师\', \'actor\': \'阿尤斯曼·库拉纳,塔布,拉迪卡·艾普特\', \'release time\': \'2019-04-03\', \'Real-time box office\': \'.万\', \'Total box office\': \'.亿\'}
{\'index\': \'9\', \'image\': \'https://p0.meituan.net/movie/0253cac859838e4fd6ae94cf986b07971008254.jpg@160w_220h_1e_1c\', \'title\': \'神奇乐园历险记\', \'actor\': \'索菲亚·玛丽,詹妮弗·加纳,肯·哈德森·坎贝尔\', \'release time\': \'2019-04-19\', \'Real-time box office\': \'.万\', \'Total box office\': \'.万\'}
{\'index\': \'10\', \'image\': \'https://p0.meituan.net/movie/86aba43e286ed044a544a75748d08aca3798593.jpg@160w_220h_1e_1c\', \'title\': \'天上再见\', \'actor\': \'纳威尔·佩雷兹·毕斯卡亚特,阿尔贝·杜邦泰尔,艾米莉·德奎恩\', \'release time\': \'2019-04-30\', \'Real-time box office\': \'.万\', \'Total box office\': \'.万\'}
5
还不明白票房数字为什么都以 &#x 开头,所以还没有完成我的目标,仍在研究中~
记于 2019-08-18:前几天在看别人的爬虫时,看到有人提到了这种反爬技术,所以我接下来要更新这篇文章啦!虽然《复仇者联盟4》已经过去了很久,最近《哪吒》比较火。