一、线程回顾
"""Threading review: creating threads by subclassing threading.Thread.

Earlier demo variants (removed here, kept as a note):
  1. A single main thread calling sing()/dance() sequentially.
  2. Procedural threads via threading.Thread(target=..., name=..., args=...).
This version shows the object-oriented approach: override run() in a subclass.
"""
import time
import threading


class SingThread(threading.Thread):
    """Worker that prints a song line five times, one per second."""

    def __init__(self, name, a):
        super().__init__()
        self.name = name  # thread name reported by threading APIs
        self.a = a        # extra demo argument carried by the instance

    def run(self):
        print("线程名:%s 参数:%s" % (self.name, self.a))
        for _ in range(1, 6):
            print('爱江山更爱美人...')
            time.sleep(1)


class DanceThread(threading.Thread):
    """Worker that prints a dance line five times, one per second."""

    def __init__(self, name, a):
        super().__init__()
        self.name = name
        self.a = a

    def run(self):
        print("线程名:%s 参数:%s" % (self.name, self.a))
        for _ in range(1, 6):
            print('蹦擦擦,蹦擦擦...')
            time.sleep(1)


def main():
    """Start both worker threads, then block until they finish."""
    singer = SingThread('唱', '八戒')
    dancer = DanceThread('跳', '悟能')

    singer.start()
    dancer.start()

    # Keep the main thread alive until both workers are done.
    singer.join()
    dancer.join()


if __name__ == '__main__':
    main()
二、队列
"""FIFO Queue walkthrough: capacity, empty/full checks, put and get."""
from queue import Queue

# A bounded queue holding at most five items.
q = Queue(5)
print(q.empty())  # True: nothing has been stored yet

# Fill the queue to capacity.
for player in ('浓眉哥', '勒布朗', '丹尼*格林', '库兹马', '麦基'):
    q.put(player)

print(q.full())   # True: all five slots are used
print(q.qsize())  # number of items currently queued

# put(item, block, timeout) when the queue is full:
#   q.put('波普', False)    -> raises queue.Full immediately
#   q.put('波普', True, 3)  -> raises queue.Full after waiting 3 seconds

# Items come back out in insertion order (first in, first out).
for _ in range(5):
    print(q.get())

# get(block, timeout) when the queue is empty:
#   q.get(False)   -> raises queue.Empty immediately
#   q.get(True, 3) -> raises queue.Empty after waiting 3 seconds
三、多线程爬虫
"""Multithreaded crawler: crawl threads download pages, parse threads extract images.

Pipeline:
    page_queue (page numbers) -> CrawlThread x3 -> data_queue (raw HTML)
    data_queue -> ParseThread x3 -> jiantu.txt (one record per line)

Shutdown: once every crawl thread has finished, main() puts one ``None``
sentinel per parse thread into data_queue; each parse thread exits when it
receives a sentinel. (The original had no shutdown signal, so the parse
threads blocked forever on ``get()`` and the program never terminated.)
"""
import time
import threading
from queue import Queue, Empty

import requests
from lxml import etree
import json

# Registry of crawl (download) threads.
crawl_thread_list = []
# Registry of parse (extract) threads.
parse_thread_list = []


def create_queue():
    """Return (page_queue, data_queue).

    page_queue is pre-filled with page numbers 1..5; data_queue starts empty
    and will carry raw HTML strings from crawl threads to parse threads.
    """
    page_queue = Queue()
    for page in range(1, 6):
        page_queue.put(page)
    data_queue = Queue()
    return page_queue, data_queue


class CrawlThread(threading.Thread):
    """Downloads pages whose numbers come from page_queue; pushes HTML to data_queue."""

    def __init__(self, name, page_queue, data_queue):
        super(CrawlThread, self).__init__()
        self.name = name
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.url = 'http://www.fanjian.net/jiantu-{}'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

    def run(self):
        print('%s启动......' % self.name)
        while 1:
            try:
                # Non-blocking get: page_queue is fully populated before any
                # thread starts, so "empty" reliably means "no more work".
                # (An empty() check followed by get() would be racy.)
                page = self.page_queue.get_nowait()
            except Empty:
                break
            url = self.url.format(page)
            r = requests.get(url=url, headers=self.headers)
            # Hand the raw HTML to the parse threads.
            self.data_queue.put(r.text)
            # BUG FIX: the original had an unconditional `break` here, so each
            # crawl thread fetched at most one page and some pages were never
            # crawled at all.
        print('%s结束......' % self.name)


class ParseThread(threading.Thread):
    """Parses raw HTML from data_queue and appends one record per image to fp."""

    def __init__(self, name, data_queue, fp, lock):
        super(ParseThread, self).__init__()
        self.name = name
        self.data_queue = data_queue
        self.fp = fp      # shared output file handle
        self.lock = lock  # guards writes to fp

    def parse_content(self, data):
        """Extract (title, image url) pairs from one page of HTML and write them out."""
        tree = etree.HTML(data)
        # Find every list item first, then search WITHIN each item using
        # relative './/' paths. BUG FIX: the original used absolute '//'
        # paths (and even tree.xpath for the URL), which matched the first
        # element on the whole page for every single item.
        li_list = tree.xpath('//ul[@class="cont-list"]/li')
        items = []
        for li in li_list:
            titles = li.xpath('.//h2/a/text()')
            urls = li.xpath('.//div[@class="cont-list-main"]/p/img/@data-src')
            if not titles or not urls:
                continue  # skip items missing either field instead of crashing
            # BUG FIX: the original built a SET literal
            # {'标题', img_title, '链接', img_url} instead of a dict.
            items.append({'标题': titles[0], '链接': urls[0]})
        # `with` guarantees the lock is released even if a write raises.
        with self.lock:
            for item in items:
                self.fp.write(str(item) + '\n')

    def run(self):
        print('%s启动......' % self.name)
        while 1:
            data = self.data_queue.get()
            if data is None:
                # Sentinel from main(): crawling is finished and the queue is
                # drained, so this thread can exit cleanly.
                break
            self.parse_content(data)
        print('%s结束......' % self.name)


def create_crawl_thread(page_queue, data_queue):
    """Create the crawl threads and register them in crawl_thread_list."""
    crawl_name = ['采集1号', '采集2号', '采集3号']
    for name in crawl_name:
        crawl_thread_list.append(CrawlThread(name, page_queue, data_queue))


def create_parse_thread(data_queue, fp, lock):
    """Create the parse threads and register them in parse_thread_list."""
    parse_name = ['解析1号', '解析2号', '解析3号']
    for name in parse_name:
        parse_thread_list.append(ParseThread(name, data_queue, fp, lock))


def main():
    """Wire the queues, start both thread pools, and shut everything down cleanly."""
    page_queue, data_queue = create_queue()
    # Append mode; utf8 so the Chinese record keys are written correctly.
    fp = open('jiantu.txt', 'a', encoding='utf8')
    lock = threading.Lock()

    create_crawl_thread(page_queue, data_queue)
    create_parse_thread(data_queue, fp, lock)

    for t in crawl_thread_list:
        t.start()
    for t in parse_thread_list:
        t.start()

    # Wait for every page to be downloaded...
    for t in crawl_thread_list:
        t.join()
    # ...then send one shutdown sentinel per parse thread so they can exit.
    for _ in parse_thread_list:
        data_queue.put(None)
    for t in parse_thread_list:
        t.join()

    fp.close()
    print('主线程执行完毕!')


if __name__ == '__main__':
    main()