最近做了测试抓取XX时报的数据。由于需要事先登录,并且有验证码,关于验证码的解决有两个途径:一是利用打码平台,其原理是把验证码的图片上传发送给打码平台,然后由平台返回识别出的验证码;二是自己研究验证码识别技术,这个有时间再研究。

目前主要是测试从XX时报抓取数据,暂时用了笨方法:利用人工介入,输入验证码。

    登录界面:

    python3 IEDriver抓取时报数据

 

    具体代码如下:

  

#coding=utf-8
import os
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.action_chains import ActionChains
import collections
import mongoDbBase
import numpy
import imagehash
from PIL import Image,ImageFile
import datetime
class finalNews_IE:
    """Crawl articles from a newspaper e-paper site using the Selenium IE driver.

    Typical workflow: longon() -> saveUrls() -> getArticleLink() (per layout)
    -> catchdata() -> export().

    The login captcha is handled by screenshotting the captcha element,
    denoising it, splitting it into one image per digit, and matching each
    piece against a pre-built library of digit images via perceptual hashing
    (imagehash.average_hash).
    """

    def __init__(self, strdate, logonUrl, firstUrl, keyword_list, exportPath, codedir):
        """
        :param strdate: date string used to tag records in MongoDB
        :param logonUrl: URL of the login page
        :param firstUrl: URL of the first (front) layout page
        :param keyword_list: keywords matched against article title/content
        :param exportPath: directory the CSV export is written to
        :param codedir: working directory for captcha screenshots and the
                        digit-image hash library (subdir ``codeLibrary``)
        """
        self.iniDriver()
        self.db = mongoDbBase.mongoDbBase()
        self.date = strdate
        self.firstUrl = firstUrl
        self.logonUrl = logonUrl
        self.keyword_list = keyword_list
        self.exportPath = exportPath
        self.codedir = codedir
        # Maps imagehash of a library digit image -> digit string; filled by longon().
        self.hash_code_dict = {}

    def iniDriver(self):
        """Start the IE webdriver (path hard-coded; raw string avoids escape issues)."""
        IEDriverServer = r"C:\Program Files\Internet Explorer\IEDriverServer.exe"
        os.environ["webdriver.ie.driver"] = IEDriverServer
        self.driver = webdriver.Ie(IEDriverServer)

    def WriteData(self, message, fileName):
        """Append *message* to exportPath/fileName.

        Opened with an explicit UTF-8 encoding so Chinese text cannot raise
        UnicodeEncodeError on a non-UTF-8 default locale.
        """
        fileName = os.path.join(os.getcwd(), self.exportPath + '/' + fileName)
        with open(fileName, 'a', encoding='utf-8') as f:
            f.write(message)

    def get_ImageHash(self, imagefile):
        """Return the average hash of an image file, or None if it does not exist."""
        hash = None
        if os.path.exists(imagefile):
            with open(imagefile, 'rb') as fp:
                hash = imagehash.average_hash(Image.open(fp))
        return hash

    # Remove noise dots from the captcha by thresholding.
    def clearNoise(self, imageFile, x=0, y=0):
        """Binarize *imageFile* in place: pixels brighter than 135 become white,
        the rest black. Returns the new image, or None if the file is missing.

        (x and y are unused but kept for interface compatibility.)
        """
        if os.path.exists(imageFile):
            image = Image.open(imageFile)
            image = image.convert('L')  # grayscale
            image = numpy.asarray(image)
            # Cast back to uint8: Image.fromarray rejects the int64 array that
            # the boolean-multiply produces.
            image = ((image > 135) * 255).astype(numpy.uint8)
            image = Image.fromarray(image).convert('RGB')
            image.save(imageFile)
            return image

    # Split the captcha image into a grid of single-digit images.
    # rownum: number of rows; colnum: number of columns;
    # imagePath: output directory; imageFile: image file to split.
    def splitimage(self, imagePath, imageFile, rownum=1, colnum=4):
        """Cut *imageFile* into rownum x colnum pieces saved under *imagePath*.

        :return: list of the piece file paths, or None when the image is
                 smaller than the requested grid.
        """
        img = Image.open(imageFile)
        w, h = img.size
        if rownum <= h and colnum <= w:
            print('Original image info: %sx%s, %s, %s' % (w, h, img.format, img.mode))
            print('开始处理图片切割, 请稍候...')

            s = os.path.split(imageFile)
            if imagePath == '':
                # Default to the source image's own directory (the original
                # computed this but never used it).
                imagePath = s[0]
            fn = s[1].split('.')
            basename = fn[0]
            ext = fn[-1]

            num = 1
            rowheight = h // rownum
            colwidth = w // colnum
            file_list = []
            for r in range(rownum):
                index = 0
                for c in range(colnum):
                    # Columns get slightly different widths to compensate for
                    # uneven digit spacing in the captcha.
                    if index < 1:
                        colwid = colwidth + 6
                    elif index < 2:
                        colwid = colwidth + 1
                    else:
                        # Explicit else: the original left colwid unassigned
                        # for index >= 3, relying on the stale previous value.
                        colwid = colwidth

                    # Box is (left, upper, right, lower).
                    box = (c * colwid, r * rowheight, (c + 1) * colwid, (r + 1) * rowheight)
                    newfile = os.path.join(imagePath, basename + '_' + str(num) + '.' + ext)
                    file_list.append(newfile)
                    img.crop(box).save(newfile, ext)
                    num = num + 1
                    index += 1
            return file_list

    def compare_image_with_hash(self, image_hash1, image_hash2, max_dif=5):
        """Return True when the two hashes differ by at most *max_dif*.

        max_dif: largest allowed absolute hash difference; smaller is stricter,
        0 means an exact match.
        """
        dif = abs(image_hash1 - image_hash2)
        return dif <= max_dif

    # Screenshot the page, crop out the captcha, and decode it.
    def savePicture(self):
        """Capture the captcha element, denoise/split it, and return the
        decoded 4-digit string (possibly shorter if some digits don't match).
        """
        self.driver.save_screenshot(self.codedir + r"\Temp.png")
        checkcode = self.driver.find_element_by_id("checkcode")
        location = checkcode.location  # x/y of the captcha element
        size = checkcode.size          # width/height of the captcha element
        rangle = (int(location['x']), int(location['y']), int(location['x'] + size['width']),
                  int(location['y'] + size['height']))  # crop box for the captcha
        i = Image.open(self.codedir + r"\Temp.png")  # full-page screenshot
        result = i.crop(rangle)  # crop just the captcha region
        filename = self.codedir + r"\Temp_code.png"
        result.save(filename)
        self.clearNoise(filename)
        file_list = self.splitimage(self.codedir, filename)
        time.sleep(3)
        verycode = ''
        for f in file_list:
            imageHash = self.get_ImageHash(f)
            if imageHash:
                # Exact-match (max_dif=0) against the digit library.
                for h, code in self.hash_code_dict.items():
                    if self.compare_image_with_hash(imageHash, h, 0):
                        verycode += code
                        break

        print(verycode)
        return verycode

    def getVerycode(self, txtFile="verycode.txt"):
        """Read a manually-entered captcha from a text file (fallback path)."""
        # with-statement fixes the file-handle leak in the original.
        with open(txtFile, 'r') as f:
            return f.read()

    def longon(self):
        """Build the digit-hash library, then attempt one login.

        Returns True when the captcha decoded to 4 digits and the login form
        was submitted, False otherwise (including any exception).
        """
        # Library layout: codeLibrary/code<digit>_<variant>.png, variants 1-4.
        for f in range(0, 10):
            for l in range(1, 5):
                file = os.path.join(self.codedir, "codeLibrary\code" + str(f) + '_' + str(l) + ".png")
                hash = self.get_ImageHash(file)
                # Skip missing library images instead of inserting a None key.
                if hash is not None:
                    self.hash_code_dict[hash] = str(f)

        flag = True
        try:
            self.driver.get(self.logonUrl)
            self.driver.maximize_window()
            time.sleep(2)
            verycode = self.savePicture()
            if len(verycode) == 4:
                accname = self.driver.find_element_by_id("username")
                accname.send_keys('ctrchina')

                accpwd = self.driver.find_element_by_id("password")
                checkcode = self.driver.find_element_by_name("checkcode")
                checkcode.send_keys(verycode)
                submit = self.driver.find_element_by_name("button")
                submit.click()
            else:
                flag = False
        except Exception as e1:
            message = str(e1.args)
            flag = False
        return flag

    # Collect layout (page) links from the front page and fetch each one.
    def saveUrls(self):
        """Retry login until it sticks, then walk every layout link."""
        while True:
            flag = self.longon()
            time.sleep(2)
            if flag:
                try:
                    # Presence of this <font> means "wrong captcha" — retry.
                    codefault = self.driver.find_element_by_xpath("//table[@class='table_login']/tbody/tr/td/font")
                    if codefault:
                        continue
                except Exception:
                    # No error element found: login succeeded.
                    pass
                break
        try:
            time.sleep(2)
            self.driver.get(self.firstUrl)
            self.driver.maximize_window()
            urllb = "//a[@id='pageLink']"
            time.sleep(2)
            elements = self.driver.find_elements_by_xpath(urllb)
            # Ordered so layouts are visited in page order.
            url_layout_dict = collections.OrderedDict()
            for element in elements:
                layout = element.text
                if len(layout) == 0:
                    continue
                link = element.get_attribute("href")
                print(link)
                if link not in url_layout_dict:
                    url_layout_dict[link] = layout
            index = 0
            for sub_url, layout in url_layout_dict.items():
                # The first layout is the page we are already on — empty URL
                # tells getArticleLink to skip navigation.
                if index == 0:
                    sub_url = ""
                print(index)
                self.getArticleLink(sub_url, layout)
                index += 1
        except Exception as e1:
            print("saveUrlsException")
            print("saveUrlsException:Exception" + str(e1.args))

    def getArticleLink(self, url, layout):
        """Collect article links of one layout page and persist them to MongoDB.

        :param url: layout page URL; empty string means "use current page"
        :param layout: layout (section) name stored alongside each link
        """
        try:
            if url:
                self.driver.get(url)
                self.driver.maximize_window()
            time.sleep(2)
            dt = datetime.datetime.now().strftime("%Y.%m.%d")
            urllb = "//div[@id='titleList']/ul/li/a"
            elements = self.driver.find_elements_by_xpath(urllb)
            url_layout_dict = {}
            for element in elements:
                txt = element.text
                # Strip a leading "(...)" prefix from the title.
                txt = txt[txt.rfind(")") + 1:len(txt)]
                # Skip placeholder/announcement/date entries.
                if txt.find("无标题") > -1 or txt.find("公 告") > -1 or txt.find("FINANCIAL NEWS") > -1 or txt.find(dt) > -1:
                    continue
                link = element.get_attribute("href")
                print(link)
                url_layout_dict[link] = layout
            self.db.SavefinalUrl(url_layout_dict, self.date)
        except Exception as e1:
            print("getArticleLink:Exception")
            print("getArticleLink:Exception" + str(e1.args))

    def catchdata(self):
        """Visit every saved article URL, scrape title/author/content, and
        store the result (with matched keywords) back to MongoDB."""
        rows = self.db.GetfinalUrl(self.date)
        lst = []
        for row in rows:
            lst.append(row)
        print("rowcount:" + str(len(lst)))
        count = 1
        for row in lst:
            url = row['url']
            layout = row['layout']
            try:
                self.driver.get(url)
                self.driver.maximize_window()
                time.sleep(1)

                # Title is assembled from h3 + h1 + h2 (+ h4 unless it is the
                # reporter byline).
                element = self.driver.find_element_by_class_name("text_c")
                title = element.find_element_by_css_selector("h3").text
                st = element.find_element_by_css_selector("h1").text
                if st:
                    title += "\n" + st
                st = element.find_element_by_css_selector("h2").text
                if st:
                    title += "\n" + st

                st = element.find_element_by_css_selector("h4").text
                if st:
                    if st.find("记者") == -1:
                        title += "\n" + st
                elements = self.driver.find_elements_by_xpath("//div[@id='ozoom']/p")

                content = ""
                key = ""
                index = 0
                author = ''
                for element in elements:
                    txt = element.text.strip().replace("\n", "")
                    content += txt
                    if index == 0:
                        # Heuristic: extract the reporter name from the first
                        # paragraph, between "记者" and "报道" when both appear.
                        if txt.find("记者") > 0 and txt.find("报道") > 0:
                            author = txt[txt.find("记者") + 2:txt.find("报道")]
                        elif txt.find("记者") > 0 and txt.find("报道") == -1:
                            author = txt[txt.find("记者") + 2:len(txt)]
                        elif txt.find("记者") == -1 and txt.find("报道") == -1:
                            author = txt.strip()
                    index += 1

                # Comma-joined list of keywords found in title or body.
                for k in self.keyword_list:
                    if content.find(k) > -1 or title.find(k) > -1:
                        key += k + ","
                if key:
                    key = key[0:len(key) - 1]
                author = author.replace("记者", "").strip()
                # Anything longer than 6 chars is unlikely to be a name.
                if len(author) > 6:
                    author = ""
                print(count)
                print(layout)
                print(url)
                print(title)
                print(author)
                count += 1
                self.db.updatefinalUrl(url)
                self.db.SavefinalData(self.date, layout, url, title, author, key, content)
            except Exception as e1:
                # Log and continue with the next article (the original
                # swallowed this silently).
                print("catchdata:Exception" + str(e1.args))
        self.driver.close()

    def export(self):
        """Export the scraped records for self.date to a CSV file.

        Commas and newlines are stripped from the fields because the file is
        written as naive comma-separated text rather than via the csv module.
        """
        rows = self.db.GetfinalData(self.date)
        lst = []
        for dataRow1 in rows:
            lst.append(dataRow1)
        count = 1
        fileName = '金融时报_' + self.date + '.csv'
        header = "发表日期,关键字,作者,全文字数,标题,版面,链接,正文"
        if len(lst) > 0:
            self.WriteData(header, fileName)

        for dataRow in lst:
            date = str(dataRow['date'])
            layout = str(dataRow['layout'])
            url = str(dataRow['url'])
            title = str(dataRow['title']).replace(",", "").replace("\n", " ")
            author = str(dataRow['author']).replace(",", "")
            key = str(dataRow['key']).replace(",", "")
            wordcount = str(dataRow['wordcount'])
            content = str(dataRow['content']).replace(",", "").replace("\n", " ")

            txt = "\n%s,%s,%s,%s,%s,%s,%s,%s" % (
                date, key, author, wordcount, title, layout, url, content)
            try:
                self.WriteData(txt, fileName)
            except Exception as e1:
                print(str(e1))
            print(count)
            count += 1


#
# dt = datetime.datetime.now().strftime("%Y-%m-%d")
# ym = datetime.datetime.now().strftime("%Y-%m")
# day = datetime.datetime.now().strftime("%d")
#
# codepath='E:/python36_crawl/mediaInfo/verycode.txt'
#
# logonUrl="http://epaper.financialnews.com.cn/dnis/client/jrsb/index.jsp"
# # firsturl="http://epaper.financialnews.com.cn/jrsb/html/2018-09/18/node_2.htm"
# firsturl="http://epaper.financialnews.com.cn/jrsb/html/"+ym+"/"+day+"/node_2.htm"
# # print(firsturl)
# keyword_list ="银保监会,央行,中国银行,中行,中银".split(",")
# exportPath="E:/News"
# codedir='E:\python36_crawl\Veriycode'
# obj = finalNews_IE(dt,logonUrl,firsturl,keyword_list,exportPath,codedir)
# # obj.saveUrls()
# obj.catchdata()
# obj.export()
# # obj.savePicture()
View Code

相关文章: