# 特别鸣谢阮思绮同学!虽然感觉这个冷冷的博客也没人看23333
import itertools
import os
import re
import sys
import urllib.request

import mysql.connector

# Connection settings for the MySQL server the results are stored in.
user = 'root'
pwd = ''
host = '127.0.0.1'
db = 'test'
data_file = 'wooyun.dat'

# SQL statements used by the script; values are always passed as parameters.
create_table_sql = (
    "CREATE TABLE IF NOT EXISTS mytable ("
    "id int(10) AUTO_INCREMENT PRIMARY KEY, "
    "type varchar(300) , "
    "info varchar(1000) , "
    "detail varchar(5000) , "
    "repair varchar(1000) )"
    "CHARACTER SET utf8"
)
insert_sql = "INSERT INTO mytable (type, info, detail, repair) VALUES ( %s, %s, %s, %s)"
select_sql = "SELECT id, type, info, detail, repair FROM mytable"

# One connection and one cursor are shared by every helper below.
cnx = mysql.connector.connect(user=user, password=pwd, host=host, database=db)
cursor = cnx.cursor()


def create_table_sql_api(a):
    """Execute the CREATE TABLE statement *a* on the shared cursor.

    Prints the MySQL error message and exits with a non-zero status if the
    statement fails.
    """
    try:
        cursor.execute(a)
    except mysql.connector.Error as err:
        print("create table 'mytable' failed.")
        print("Error: {}".format(err.msg))
        sys.exit(1)


def insert_sql_api(a, b):
    """Execute the INSERT statement *a* with the parameter tuple *b*.

    Prints the MySQL error message and exits with a non-zero status if the
    statement fails.
    """
    try:
        cursor.execute(a, b)
    except mysql.connector.Error as err:
        print("insert table 'mytable' failed.")
        print("Error: {}".format(err.msg))
        sys.exit(1)


def select_sql_api(a):
    """Execute the SELECT statement *a* and print every returned row.

    The statement is expected to yield five columns per row
    (id, type, info, detail, repair). Prints the MySQL error message and
    exits with a non-zero status if the query fails.
    """
    try:
        cursor.execute(a)
        # Names avoid shadowing the builtins ``id`` and ``type``; each of the
        # five columns gets its own label so no value is dropped or mislabeled.
        for (row_id, bug_type, info, detail, repair) in cursor:
            print("ID:{} type:{} info:{} detail:{} repair:{}".format(
                row_id, bug_type, info, detail, repair))
    except mysql.connector.Error as err:
        print("query table 'mytable' failed.")
        print("Error: {}".format(err.msg))
        sys.exit(1)


def get_html_response(url):
    """Download *url* and return the response body decoded as UTF-8."""
    # The context manager closes the HTTP response even if read/decode fails.
    with urllib.request.urlopen(url) as response:
        html_response = response.read().decode('utf-8')
    return html_response


def geturl(starturl):
    """Return every ``/bugs/wooyun-…-…`` path found in the page at *starturl*."""
    a = get_html_response(starturl)
    childurl = re.findall(r'/bugs/wooyun-\w*-\w*\b', a)
    return childurl


def get_nextpage(starturl):
    """Return the paginated search links found in the page at *starturl*.

    Each entry is a relative link of the form
    ``searchbug.php?q=6YeR6J6N&pNO=<page>``.
    """
    d = get_html_response(starturl)
    # ``\w+`` captures the whole page number; a single ``\w`` would cut
    # "pNO=10" down to "pNO=1" and re-crawl an earlier page instead.
    nextpage = re.findall(r'searchbug.php\?q=6YeR6J6N&pNO=\w+', d)
    return nextpage


starturl = "http://www.wooyun.org/searchbug.php?q=6YeR6J6N"

# Working lists shared with the rest of the script.
result = []
final = []
type_wooyun_n = []
info_n = []
detail_n = []
repair_n = []
# output = open("D:\\wooyun.csv", "w+")  # optional CSV export, disabled

create_table_sql_api(create_table_sql)

for i in get_nextpage(starturl):
    # Scan each listing page and collect the vulnerability URLs into result.
    result += geturl('http://wooyun.org/' + re.sub('金融', '6YeR6J6N', i))
# Patterns that locate the four sections of a vulnerability page.
_TYPE_RE = re.compile(r'漏洞类型:.*.</h3>')
# Whitespace is matched with \s; this finds the text of every <h3> heading.
_INFO_RE = re.compile(r'<h3>\w*:.*.</h3>')
_DETAIL_RE = re.compile(r'<p class="detail">.*.</p>')
_REPAIR_RE = re.compile(r'修复方案:</h3>\s*<p class="detail">.*.\s*</p>')

# Ordered (pattern, replacement) pairs applied to each match of a section.
# Vulnerability type: kept for later classification in the database.
_TYPE_SUBS = tuple((re.compile(p), r) for p, r in (
    (r':\s', ':'),
    (r'\t', ''),
    (r'</h3>', ''),
))
# Summary headings.
_INFO_SUBS = tuple((re.compile(p), r) for p, r in (
    (r':\s', ':'),
    (r'<h3>', ''),
    (r'</h3>', ''),
    (r'<a\shref=".*.">', ''),
    (r'</a>', ''),
    (r'<imgheight=".*./>', ''),
))
# Detail paragraphs.
_DETAIL_SUBS = tuple((re.compile(p), r) for p, r in (
    (r':\s', ':'),
    (r'<p\sclass="detail">', ''),
    (r'</p>', ''),
    (r'"\starget="_blank"><img\ssrc="/upload/.*.width="600"/></a>', ','),
    (r'<a href="', ' http://www.wooyun.org'),
    (r'对本漏洞信息进行评价,.*.备学习价值', ''),
))
# Repair advice.
_REPAIR_SUBS = tuple((re.compile(p), r) for p, r in (
    (r'</br>', ','),
    (r'</p>', ','),
    (r'</h3>', ','),
    (r'<p\sclass="detail">', ''),
    # NOTE(review): ':' -> ':' is a no-op as written; presumably one side was
    # a full-width colon before the text was re-encoded - confirm.
    (r':', ':'),
))


def _apply_subs(text, subs):
    """Return *text* after applying each (pattern, replacement) pair in order."""
    for pattern, replacement in subs:
        text = pattern.sub(replacement, text)
    return text


def _strip_whitespace(text):
    """Return *text* with every run of whitespace removed."""
    return "".join(text.split())


result = set(result)  # drop duplicate addresses

try:
    for i in result:
        # Download the vulnerability page.
        k = get_html_response(
            'http://wooyun.org/' + re.sub('金融', '%E9%87%91%E8%9E%8D', i))

        # Each column is the concatenation of all cleaned matches of its
        # section; summary and repair text additionally lose all whitespace.
        type_wooyun_str = "".join(
            _apply_subs(j, _TYPE_SUBS) for j in _TYPE_RE.findall(k))
        info_str = "".join(
            _strip_whitespace(_apply_subs(j, _INFO_SUBS))
            for j in _INFO_RE.findall(k))
        detail_str = "".join(
            _apply_subs(j, _DETAIL_SUBS) for j in _DETAIL_RE.findall(k))
        repair_str = "".join(
            _strip_whitespace(_apply_subs(j, _REPAIR_SUBS))
            for j in _REPAIR_RE.findall(k))

        insert_sql_api(
            insert_sql, (type_wooyun_str, info_str, detail_str, repair_str))
        # Prints the whole table after every insert.
        select_sql_api(select_sql)
        # output.writelines(final)   # optional CSV export, disabled
        # output.writelines('\n\n')

    # Persist all inserted rows once every page has been processed.
    cnx.commit()
finally:
    # Release the cursor and connection even if a download or insert fails.
    cursor.close()
    cnx.close()
# output.close()