
The general approach: push the download tasks onto a pending queue up front (the pending queue can also be refilled on a schedule);

one thread watches whether all tasks have finished and, once they have, processes the download results;

several worker threads pull download tasks from the pending queue; a finished task goes onto a done queue, and a failed one goes back to pending. A minimal sketch of this pattern follows, then the full script.
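A minimal, self-contained sketch of the pattern, under assumptions of mine rather than taken from the script below: handle() is a hypothetical stand-in for the download-and-parse step, and Queue.task_done()/join() stands in for polling the done queue's size as the completion check.

import queue
import threading

pending = queue.Queue()
done = queue.Queue()

def handle(page):
    # hypothetical download step; pretend it always succeeds
    return True

def work():
    while True:
        page = pending.get()        # blocks until a task arrives
        try:
            if handle(page):
                done.put(page)      # success: record the finished page
            else:
                pending.put(page)   # failure: requeue for another try
        finally:
            pending.task_done()     # balance this get() for join()

for n in range(1, 101):
    pending.put(n)
for _ in range(8):
    threading.Thread(target=work, daemon=True).start()
pending.join()                      # returns only once every page succeeded
print(done.qsize())                 # 100

Because put() raises the unfinished-task count and task_done() lowers it, a requeued failure keeps join() blocked until the retry eventually succeeds, which is the same retry semantics the script below implements by hand.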

# _*_ coding: utf-8 _*_
import requests
from threading import Thread
from queue import Queue, Empty
import time, json, math
import sys, os

req_queue  = Queue(maxsize=30000)  # pending pages
res_queue  = Queue()               # finished pages

class pagest():
    max_page   = 1
    username   = 'test'
    password   = '123'
    date_start = '2019-05-01'
    date_end   = '2019-05-05'
    cookies    = {}

class maintd(Thread):
    """Watch the done queue; once every page is in, merge the result files."""
    def run(self):
        while True:
            if res_queue.qsize() >= pagest.max_page:
                bigfile()
                break
            time.sleep(10)


class worker(Thread):
    """Pull pages from the pending queue, parse them, and route the results."""
    def run(self):
        while True:
            try:
                # Another worker may requeue a failed page at any moment,
                # so don't exit just because the queue looks empty;
                # wait up to 30s for new work instead.
                page = req_queue.get(block=True, timeout=30)
            except Empty:
                break
            if parser(page):
                res_queue.put(page)   # success: mark the page as done
            else:
                req_queue.put(page)   # failure: put it back and back off
                time.sleep(5)


def parser(page):
    try:
        url = 'http://a.b.c/log.html?page=%s&date_start=%s&date_end=%s' % (page, pagest.date_start, pagest.date_end)
        res = requests.get(url, cookies=pagest.cookies)
        json.loads(res.text)  # sanity check: a non-JSON body counts as a failure
        with open("dt/%s.txt" % page, "w") as f:
            f.write(res.text + '\n')
    except Exception as e:
        print(e)
        return False
    return True


def login():
    url = 'http://a.b.c/login.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'
    }
    # First GET obtains a session id, then the credentials are POSTed against it.
    res = requests.get(url, headers=headers)
    sessid = res.cookies.get('PHPSESSID')
    param = {
        'username': pagest.username,
        'password': pagest.password
    }
    requests.post(url, headers=headers, cookies={'PHPSESSID': sessid}, data=param)
    pagest.cookies = {
        'PHPSESSID': sessid
    }

    # Page 1 carries the total record count; the list is paged 100 per page.
    url = 'http://a.b.c/log.html?page=1&date_start=%s&date_end=%s' % (pagest.date_start, pagest.date_end)
    res = requests.get(url, cookies=pagest.cookies)
    cnt = res.json().get('count')
    pagest.max_page = math.ceil(int(cnt) / 100)

def init():
    # Recreate an empty dt/ directory next to the script.
    path = os.path.join(os.path.realpath(sys.path[0]), 'dt')
    if not os.path.exists(path):
        os.mkdir(path)
    for name in os.listdir(path):
        os.remove(os.path.join(path, name))

def bigfile():
    # Concatenate the per-page files into one result file.
    with open("dt/all.txt", "a") as bf:
        for i in range(1, pagest.max_page + 1):
            with open("dt/%s.txt" % i, "r") as f:
                bf.write(f.read())

def main():
    init()
    login()
    td = maintd()
    td.start()

    # Fill the pending queue with page numbers 1..max_page.
    for i in range(pagest.max_page):
        req_queue.put(i + 1)

    # Fan out to 30 workers and wait for them to drain the queue.
    worker_list = [worker() for _ in range(30)]
    for w in worker_list:
        w.start()
    for w in worker_list:
        w.join()

    td.join()

if __name__ == '__main__':
    main()
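For comparison, the same fan-out can be sketched with the standard library's concurrent.futures instead of hand-rolled worker threads. This reuses init(), login(), parser(), pagest, and bigfile() from the script above; fetch_page() is a hypothetical wrapper I am adding that bounds the retries instead of requeueing forever.

from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch_page(page):
    for attempt in range(3):   # bounded retry instead of endless requeueing
        if parser(page):
            return page
        time.sleep(5)
    raise RuntimeError('page %s failed after 3 attempts' % page)

def main_with_pool():
    init()
    login()
    with ThreadPoolExecutor(max_workers=30) as pool:
        futures = [pool.submit(fetch_page, p) for p in range(1, pagest.max_page + 1)]
        for fut in as_completed(futures):
            fut.result()       # re-raises if a page ultimately failed
    bigfile()                  # every page is on disk: merge the files

The executor replaces both the worker pool and the maintd watcher: waiting on the futures is itself the completion check.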
