和朋友在QQ上聊天总感觉没有激情,于是突发奇想:写个小爬虫,把表情包爬取下来随便挑,斗到他们吐血。
      
                多线程爬取斗图表情包



下面是爬取斗图表情包的代码,仅供参考:

#encoding:utf8
#模块
import re
import requests
from lxml import etree
import os
import random
import threading
import time
import hashlib

def makemake(path, base_dir='F:\\11\\斗图\\'):
    """Create the directory ``base_dir + path`` unless it already exists.

    Commas, colons, question marks, exclamation marks and spaces are
    stripped from ``path`` before it is used as a directory name.

    Args:
        path: Directory name to create below ``base_dir``.
        base_dir: Prefix prepended verbatim to the cleaned name, so it must
            end with a path separator. Defaults to the original
            hard-coded download root.

    Returns:
        None. Progress is reported on stdout.
    """
    # Raw-string character class; the previous pattern relied on invalid
    # escape sequences ('\,', '\:', '\!') inside a normal string literal.
    path = re.sub(r'[,:?!]', '', path)
    path = path.replace(' ', '')
    target = base_dir + path
    if os.path.isdir(target):
        print(path + '已经存在')
    else:
        print('开始创建' + path)
        # exist_ok guards against a concurrent caller creating the same
        # directory between the isdir() check above and this call.
        os.makedirs(target, exist_ok=True)

def make_files(path, source, base_dir='F:\\11\\斗图\\', max_attempts=7, timeout=10):
    """Download ``http://<source>`` and save it as ``base_dir + path``.

    Commas, colons, question marks, exclamation marks and spaces are
    stripped from ``path`` first. If the target file already exists the
    download is skipped. Failed requests are retried with a one-second
    pause; after ``max_attempts`` failures the download is abandoned and
    no file is written.

    Args:
        path: File name (relative to ``base_dir``) to write.
        source: URL without the ``http://`` scheme prefix.
        base_dir: Prefix prepended verbatim to the cleaned name, so it must
            end with a path separator. Defaults to the original
            hard-coded download root.
        max_attempts: Total number of request attempts before giving up.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        None. Progress is reported on stdout.
    """
    path = re.sub(r'[,:?!]', '', path)
    path = path.replace(' ', '')
    target = base_dir + path
    if os.path.isfile(target):
        print(path + '已经存在')
        return

    url = 'http://' + source
    content = None
    # The counter lives outside the retry body; the old loop reset it on
    # every pass, so the give-up branch could never be reached.
    for attempt in range(1, max_attempts + 1):
        try:
            content = requests.get(url, timeout=timeout).content
            break
        except requests.RequestException:
            print(url + '连接出错正在重试当前次数:' + str(attempt))
            if attempt < max_attempts:
                time.sleep(1)

    if content is None:
        # Nothing was downloaded: do not create a file, otherwise the next
        # run would treat the empty file as already downloaded.
        print('放弃' + url)
        return

    print('正在下载' + path)
    with open(target, 'wb') as file:
        file.write(content)
#关闭


def start_spider(g):
print('当前下载页数为:' + str(g))
while True:
n = 1
try:
yuan = requests.get('https://www.doutula.com/article/list/?page=' + str(g)).text
break
except:
print('https://www.doutula.com/article/list/?page=' + str(g) + '连接出错正在重试当前次数:' + str(n))
time.sleep(1)
n = n + 1

lists = etree.HTML(yuan).xpath('//*[@>'完成')



相关文章: