【发布时间】:2015-03-18 01:53:02
【问题描述】:
我用 Python3 写了一个简单的脚本,它枚举某个 POST 请求的所有可能输入。我遇到的问题是:创建完所有线程之后,内存占用一直在增长,最后进程会因为内存不足而被系统杀死。我使用 Pympler 检查了 myThread 类,结果表明 myThread 所有实例的内存使用量并没有迅速增加。我不知道是什么导致了这种内存泄漏。
import os
import threading
import time

import requests
class myThread(threading.Thread):
    """Worker thread that hands its ``st``/``ed`` bounds to ``get_range``."""

    def __init__(self, threadID, name, st, ed):
        """Store the worker's id, display name and range bounds.

        Args:
            threadID: Identifier kept on the instance as ``threadID``.
            name: Thread name; printed when the worker starts and exits.
            st: First argument later passed to ``get_range``.
            ed: Second argument later passed to ``get_range``.
        """
        super().__init__()
        self.threadID, self.st, self.ed = threadID, st, ed
        # Assigned after the base initializer so it replaces the default name.
        self.name = name

    def run(self):
        """Announce start, process the assigned range, then announce exit."""
        print("Starting " + self.name)
        get_range(self.st, self.ed)
        print("Exiting " + self.name)
def get_by_id(n, timeout=None):
    """POST ``n`` as the ``id`` form field and save the page when it exists.

    Args:
        n: The id to query, as a string. It is also used as the file name
            under ``./pages``.
        timeout: Optional timeout in seconds forwarded to ``requests.post``.
            The default ``None`` keeps the previous behaviour of waiting
            indefinitely.

    Returns:
        -2 if the request raised a ``requests`` error or the status code
        was not 200, -1 if the response text contains "Cannot find",
        1 after the response text has been written to ``./pages/<n>``.
    """
    payload = {"id": n}
    url = "http://www.example.com"  # This is for example
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        'Accept-Encoding': "gzip, deflate",
    }
    try:
        r = requests.post(url, data=payload, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Connection problems, timeouts, invalid responses, ...
        return -2
    # Compare by value; ``is not 200`` tests object identity, which is not
    # a guaranteed property of equal integers.
    if r.status_code != 200:
        return -2
    body = r.text
    if "Cannot find" in body:
        return -1
    # Make sure the output directory exists before the first write.
    os.makedirs("./pages", exist_ok=True)
    # Explicit encoding so the write does not depend on the platform default.
    with open(os.path.join("./pages", n), 'w', encoding="utf-8") as f:
        f.write(body)
    return 1
def get_range(a, b):
    """Call ``get_by_id`` once for every integer in ``range(a, b)``.

    Each integer is converted to a string before the call; the status code
    returned by ``get_by_id`` is discarded.
    """
    for candidate in range(a, b):
        get_by_id(str(candidate))
if __name__ == "__main__":
    # 20 workers, each given its own contiguous slice of 4000 ids.
    first_id = 800000000000
    ids_per_thread = 4000
    threads = []
    for index in range(20):
        lower = first_id + index * ids_per_thread
        worker = myThread(index, "Thread-" + str(index), lower, lower + ids_per_thread)
        threads.append(worker)
        worker.start()
        # Stagger the start of consecutive workers by 0.3 s.
        time.sleep(0.3)
    # Wait for every worker to finish before announcing the end.
    for worker in threads:
        worker.join()
    print("Exiting Main")
以下是删除所有可能导致内存问题的文件操作后的代码。
import requests
import threading
import time
class myThread(threading.Thread):
    """Thread subclass that runs ``get_range`` over its stored bounds."""

    def __init__(self, threadID, name, st, ed):
        """Record the thread id, thread name and the two range bounds."""
        super().__init__()
        self.threadID, self.st, self.ed = threadID, st, ed
        # Set after the base initializer so the default name is overridden.
        self.name = name

    def run(self):
        """Print a start banner, call ``get_range``, print an exit banner."""
        print("Starting " + self.name)
        get_range(self.st, self.ed)
        print("Exiting " + self.name)
def get_by_id(n, timeout=None):
    """POST ``n`` as the ``id`` form field and classify the response.

    Args:
        n: The id to query, as a string.
        timeout: Optional timeout in seconds forwarded to ``requests.post``.
            The default ``None`` keeps the previous behaviour of waiting
            indefinitely.

    Returns:
        -2 if the request raised a ``requests`` error or the status code
        was not 200, -1 if the response text contains "Cannot find",
        otherwise 1.
    """
    payload = {"id": n}
    url = "http://www.example.com"  # This is for example
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        'Accept-Encoding': "gzip, deflate",
    }
    try:
        r = requests.post(url, data=payload, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Connection problems, timeouts, invalid responses, ...
        return -2
    # Compare by value; ``is not 200`` tests object identity, which is not
    # a guaranteed property of equal integers.
    if r.status_code != 200:
        return -2
    if "Cannot find" in r.text:
        return -1
    return 1
def get_range(a, b):
    """Query every id in ``range(a, b)`` through ``get_by_id``.

    The integer is stringified before being passed on, and the returned
    status code is ignored.
    """
    for current in range(a, b):
        get_by_id(str(current))
if __name__ == "__main__":
    # Split the id space into 20 consecutive chunks of 4000, one per thread.
    start_of_ids = 800000000000
    chunk = 4000
    threads = []
    for number in range(20):
        begin = start_of_ids + number * chunk
        crawler = myThread(number, "Thread-" + str(number), begin, begin + chunk)
        threads.append(crawler)
        crawler.start()
        # Pause 0.3 s between launching consecutive threads.
        time.sleep(0.3)
    # Block until all threads have completed.
    for crawler in threads:
        crawler.join()
    print("Exiting Main")
【问题讨论】:
标签: python multithreading web-crawler