在数据库中建库建表
# 连接到mysql数据库 mysql -h127.0.0.1 -uroot -p123456 # 建库建表 create database maoyandb charset utf8; use maoyandb; create table filmtab( name varchar(100), star varchar(300), time varchar(50) )charset=utf8;
回顾pymysql基本使用
import pymysql # 创建2个对象 db = pymysql.connect(\'localhost\',\'root\',\'123456\',\'maoyandb\',charset=\'utf8\') cursor = db.cursor() # 执行SQL命令并提交到数据库执行 # execute()方法第二个参数为列表传参补位 ins = \'insert into filmtab values(%s,%s,%s)\' cursor.execute(ins,[\'霸王别姬\',\'张国荣\',\'1993\']) db.commit() # 关闭 cursor.close() db.close()
来试试高效的executemany()方法?
import pymysql # 创建2个对象 db = pymysql.connect(\'192.168.153.137\',\'tiger\',\'123456\',\'maoyandb\',charset=\'utf8\') cursor = db.cursor() # 抓取的数据 film_list = [(\'月光宝盒\',\'周星驰\',\'1994\'),(\'大圣娶亲\',\'周星驰\',\'1994\')] # 执行SQL命令并提交到数据库执行 # execute()方法第二个参数为列表传参补位 cursor.executemany(\'insert into filmtab values(%s,%s,%s)\',film_list) db.commit() # 关闭 cursor.close() db.close()
将电影信息存入MySQL数据库(尽量使用executemany方法)
from urllib import request import re import time import random from useragents import ua_list import pymysql class MaoyanSpider(object): def __init__(self): self.url = \'https://maoyan.com/board/4?offset={}\' # 计数 self.num = 0 # 创建2个对象 self.db = pymysql.connect( \'localhost\',\'root\',\'123456\',\'maoyandb\',charset=\'utf8\' ) self.cursor = self.db.cursor() def get_html(self,url): headers = { \'User-Agent\' : random.choice(ua_list) } req = request.Request(url=url,headers=headers) res = request.urlopen(req) html = res.read().decode(\'utf-8\') # 直接调用解析函数 self.parse_html(html) def parse_html(self,html): re_bds = r\'<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?releasetime">(.*?)</p>\' pattern = re.compile(re_bds,re.S) # film_list: [(\'霸王别姬\',\'张国荣\',\'1993\'),()] film_list = pattern.findall(html) # 直接调用写入函数 self.write_html(film_list) # # mysql - execute() # def write_html(self,film_list): # ins = \'insert into filmtab values(%s,%s,%s)\' # for film in film_list: # L = [ # film[0].strip(), # film[1].strip(), # film[2].strip()[5:15] # ] # self.cursor.execute(ins,L) # # 千万别忘了提交到数据库执行 # self.db.commit() # mysql - executemany([(),(),()]) def write_html(self, film_list): L = [] ins = \'insert into filmtab values(%s,%s,%s)\' for film in film_list: t = ( film[0].strip(), film[1].strip(), film[2].strip()[5:15] ) L.append(t) self.cursor.executemany(ins, L) # 千万别忘了提交到数据库执行 self.db.commit() def main(self): for offset in range(0,31,10): url = self.url.format(offset) self.get_html(url) time.sleep(random.randint(1,2)) # 断开数据库连接 self.cursor.close() self.db.close() if __name__ == \'__main__\': start = time.time() spider = MaoyanSpider() spider.main() end = time.time() print(\'执行时间:%.2f\' % (end-start))
做个SQL查询
1、查询20年以前的电影的名字和上映时间 select name,time from filmtab where time<(now()-interval 20 year); 2、查询1990-2000年的电影名字和上映时间 select name,time from filmtab where time>=\'1990-01-01\' and time<=\'2000-12-31\';