google image
google图片抓取
Google 图片是以 base64 编码(并非加密)的,编码后的数据放在页面的 script 标签里面
import pymysql
from lxml import etree
import logging
import requests
import time
import threading
from threading import RLock
import re
import os
# Module-wide reentrant lock. The crawler class in this file acquires it
# around its use of the shared database cursor/connection.
lock = RLock()
import base64
import ssl
# Replace the ssl module's default HTTPS context factory with the unverified
# one, so code that relies on that default skips certificate verification.
# NOTE(review): this disables TLS certificate checks process-wide for such
# code; it is unclear from this file whether `requests` is affected by this
# override — confirm it is still needed.
ssl._create_default_https_context = ssl._create_unverified_context
# Logging setup: append INFO-and-above records to a log file.
logging.basicConfig(
    level=logging.INFO,  # records at this level or higher are written
    format='%(asctime)s %(filename)s %(levelname)s : %(message)s',  # record layout
    datefmt='%Y-%m-%d %H:%M:%S',  # timestamp format used for %(asctime)s
    filename='drugimagesError.log',  # log file name
    filemode='a')  # file open mode: "w" (truncate) or "a" (append)
class google_images(object):
    """Crawl Google Images for drug records and store the embedded JPEGs.

    Rows are read from the ``guo_cai_jin_kou_yao_pin`` table in pages of
    1000. For every row whose approval number has no entry in ``drugimages``
    yet, a worker thread runs an image search and stores up to 30 base64
    JPEGs found in the result page, both in ``drugimages`` and as ``.jpg``
    files under ``./images/``.

    Constructing an instance connects to the database and immediately runs
    the crawl loop; the constructor only ends through ``SystemExit`` (no
    rows left, or the row with id ``end_record`` was reached) or an
    unhandled error.
    """

    def __init__(self):
        # Paging cursor: only rows with an id greater than this are fetched.
        self.strat_record = 1
        # The crawl stops when a row with exactly this id is reached.
        self.end_record = 10000001
        self.db = pymysql.connect(host='localhost', port=3306, database='yao_zhi', user='root', password='root',
                                  charset='utf8')
        # One cursor is shared by the main thread and all workers; every use
        # of it is serialized through the module-level ``lock``.
        self.cursor = self.db.cursor()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
        while True:
            self.parse_page()

    @staticmethod
    def _extract_specifications(me_key):
        """Return the text up to and including the first comma of *me_key*.

        When there is no comma, or *me_key* is not a string (for example
        ``None``), *me_key* is returned unchanged.
        """
        try:
            return re.findall(r".+?,", me_key)[0]
        except (IndexError, TypeError):
            # IndexError: no comma-terminated prefix; TypeError: not a string.
            return me_key

    @staticmethod
    def _image_dir(record_id):
        """Return the directory (with trailing slash) for a record's images."""
        # Integer division avoids the float round trip of int(id / 1000).
        page_id = int(record_id) // 1000
        return './images/' + 'page' + str(page_id) + '/' + 'id' + str(record_id) + '/'

    def parse_page(self):
        """Process the next page of up to 1000 drug rows.

        Starts one worker thread per row that has no stored images yet and
        waits for all of them before returning.

        Raises:
            SystemExit: when no rows remain after ``strat_record`` or when
                the row with id ``end_record`` is reached.
        """
        with lock:
            # ``order by id`` makes "last id seen" a valid paging cursor;
            # without it the row order of a LIMIT query is not guaranteed.
            num = self.cursor.execute(
                "select id, me_pizhunwenhao, me_name, me_jixing, me_key "
                "from guo_cai_jin_kou_yao_pin where id > %s order by id limit 1000",
                (self.strat_record,))
            data_tuple = self.cursor.fetchall()
        if not num:
            raise SystemExit

        threading_list = []
        for record_id, approvalNumber, drugName, dosageForm, me_key in data_tuple:
            specifications = self._extract_specifications(me_key)
            self.strat_record = record_id
            logging.info("id:%s approvalNumber:%s drugName:%s dosageForm:%s specifications:%s",
                         record_id, approvalNumber, drugName, dosageForm, specifications)
            print("id:%s approvalNumber:%s drugName:%s dosageForm:%s specifications:%s" % (
                record_id, approvalNumber, drugName, dosageForm, specifications))
            if str(record_id) == str(self.end_record):
                raise SystemExit

            with lock:
                # Parameterized so quotes in the value cannot break the SQL.
                already_stored = self.cursor.execute(
                    "select id from drugimages where approvalNumber = %s", (approvalNumber,))
            if not already_stored:
                t = threading.Thread(target=self.parse_page_data,
                                     args=(record_id, approvalNumber, drugName, dosageForm, specifications,))
                t.start()
                threading_list.append(t)
                # Pause after each launch so searches are spaced 3 seconds apart.
                time.sleep(3)

        for t_one in threading_list:
            t_one.join()

    def parse_page_data(self, id, approvalNumber, drugName, dosageForm, specifications):
        """Search images for one drug and store up to 30 embedded JPEGs.

        Args:
            id: Primary key of the source row; used for the ``durgid``
                column and for the output directory name.
            approvalNumber: Approval number; stored and used in file names.
            drugName: Drug name; first part of the search keyword.
            dosageForm: Dosage form; second part of the search keyword.
            specifications: Specification text; last part of the keyword.

        Request failures and result pages without the expected script block
        are logged and the record is skipped.
        """
        print("id:%s approvalNumber:%s drugName:%s specifications:%s" % (
            id, approvalNumber, drugName, specifications))
        # Skip empty/None parts instead of failing on str + None.
        keyword = ' '.join(str(part) for part in (drugName, dosageForm, specifications) if part)
        try:
            # ``params`` URL-encodes the keyword; the timeout keeps a stalled
            # connection from blocking this worker (and the join) forever.
            response = requests.get(url='https://www.google.com/search',
                                    params={'biw': 1920, 'bih': 900, 'tbm': 'isch', 'q': keyword},
                                    headers=self.headers, timeout=30)
        except requests.RequestException:
            logging.exception("id:%s request failed for keyword:%s", id, keyword)
            return
        print(response.url)

        data_particular = etree.HTML(response.content)
        script_texts = [] if data_particular is None else data_particular.xpath('//span[@id="xjs"]/script/text()')
        if not script_texts:
            logging.warning("id:%s no image script found for keyword:%s", id, keyword)
            return
        images_list = script_texts[0]
        # NOTE(review): the greedy ``.*`` runs to the last ``"`` on a line, so
        # this presumably relies on one data URI per line — confirm.
        images_list_link = re.findall(r'data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD.*"\]?', images_list)

        drugsql = ("insert into drugimages(approvalNumber, drugName, specifications, image, num, durgid) "
                   "values(%s, %s, %s, %s, %s, %s)")
        # Only the first 30 matches are stored.
        for num, link in enumerate(images_list_link[:30], start=1):
            url_link = link.replace('"]', '')
            # The script text carries backslash escapes; turn them into the
            # characters they stand for.
            image = url_link.encode('utf-8').decode('unicode_escape')
            image_data = image.replace('data:image/jpeg;base64,', '')
            sql_args = (approvalNumber, drugName, specifications, image, int(num), int(id))
            logging.info("id:%s approvalNumber:%s drugName:%s specifications:%s",
                         id, approvalNumber, drugName, specifications)
            with lock:
                print('sql_data:%s' % self.cursor.mogrify(drugsql, sql_args))
                self.cursor.execute(drugsql, sql_args)
                self.db.commit()

            pic_content = base64.b64decode(image_data)
            files = self._image_dir(id)
            # exist_ok avoids the check-then-create race between workers.
            os.makedirs(files, exist_ok=True)
            with open(files + str(approvalNumber) + '-' + str(num) + '.jpg', 'wb') as file:
                file.write(pic_content)
if __name__ == '__main__':
    # Instantiating the class is what runs the crawl: __init__ loops over
    # parse_page() until SystemExit is raised.
    google_images()