chengdongzi

人工智能—爬虫

 

抓取图片

# 导包
import requests
import re
from lxml import etree
import os


# Scraper for pearvideo.com: fetches/caches pages and downloads card images.
class PearVideo(object):

    # Fetch a page, caching it to a local HTML file.
    #
    # url:  page URL to fetch
    # type: 'index' caches to test_pear.html, anything else to inner_pear.html
    #       (name shadows the builtin but is kept for caller compatibility)
    # Returns the page HTML as a str.
    # NOTE: method name 'get_countent' is a typo for 'get_content', kept
    # because external callers use it.
    def get_countent(self, url, type):
        if type == 'index':
            file_name = 'test_pear.html'
        else:
            file_name = 'inner_pear.html'

        # BUG FIX: check the file actually selected above, not the
        # hard-coded 'test_pear.html' — otherwise inner pages were never
        # cached or served from cache correctly.
        if not os.path.exists(file_name):
            # Send the HTTP request and decode the body.
            r = requests.get(url)
            html = r.content.decode('utf-8')

            # Cache to disk with an explicit encoding.
            with open('./' + file_name, 'w', encoding='utf-8') as f:
                f.write(html)
            return html
        else:
            # Serve the cached copy.
            with open('./' + file_name, encoding='utf-8') as f:
                return f.read()

    # Extract card-image URLs from the index page HTML and download them.
    #
    # html: raw HTML string as returned by get_countent.
    def get_xpath(self, html):
        # BUG FIX: the raw string must be parsed into an element tree
        # before .xpath() can be called on it.
        html = etree.HTML(html)

        # Each video card stores its thumbnail in an inline style attribute.
        html_data_img = html.xpath('//div[@class="img"]/@style')
        print(html_data_img)

        # Compile once, outside the loop; raw string avoids invalid-escape
        # warnings on \( and \).
        regex = re.compile(r'background-image: url\((.+?)\);')
        img_list = []
        for item in html_data_img:
            # Some style attributes may not match — skip them instead of
            # raising IndexError.
            found = regex.findall(item)
            if found:
                img_list.append(found[0])

        # BUG FIX: write each image to its own numbered file; the original
        # overwrote a single test_pear.png and then exit(-1)'d after the
        # first download, so only one image was ever saved.
        for index, item in enumerate(img_list):
            r = requests.get(item)
            with open('./test_pear_%d.png' % index, 'wb') as f:
                f.write(r.content)

        print(img_list)
 

if __name__ == "__main__":
    # Instantiate the scraper.
    pearvideo = PearVideo()
    # BUG FIX: the original passed the *builtin* `type` instead of the
    # string 'index', so the 'index' branch in get_countent never ran.
    html = pearvideo.get_countent('https://www.pearvideo.com/', 'index')
    pearvideo.get_xpath(html)
 


 

抓取视频

# 导包
import requests
import re
from lxml import etree
import os


# Scraper for pearvideo.com: fetches/caches pages, follows a detail page,
# and downloads the real video stream found in its inline JavaScript.
class PearVideo(object):

    # Fetch a page, caching it to a local HTML file.
    #
    # url:  page URL to fetch
    # type: 'index' caches to test_pear.html, anything else to inner_pear.html
    #       (name shadows the builtin but is kept for caller compatibility)
    # Returns the page HTML as a str.
    # NOTE: method name 'get_countent' is a typo for 'get_content', kept
    # because external callers use it.
    def get_countent(self, url, type):
        if type == 'index':
            file_name = 'test_pear.html'
        else:
            file_name = 'inner_pear.html'

        # BUG FIX: check the file actually selected above, not the
        # hard-coded 'test_pear.html' — otherwise inner pages were never
        # cached or served from cache correctly.
        if not os.path.exists(file_name):
            # Send the HTTP request and decode the body.
            r = requests.get(url)
            html = r.content.decode('utf-8')

            # Cache to disk with an explicit encoding.
            with open('./' + file_name, 'w', encoding='utf-8') as f:
                f.write(html)
            return html
        else:
            # Serve the cached copy.
            with open('./' + file_name, encoding='utf-8') as f:
                return f.read()

    # Parse the index page, follow one detail page, and append its video
    # bytes to ./test_pear.mp4.
    #
    # html: raw index-page HTML string as returned by get_countent.
    def get_xpath(self, html):
        # Parse the raw HTML into an element tree.
        html = etree.HTML(html)

        # Detail-page hrefs are relative; make them absolute.
        html_data_url = html.xpath("//div[@class='actcontbd']/a/@href")
        url_list = ['https://www.pearvideo.com/' + item for item in html_data_url]
        print(url_list)

        # Fetch one detail page (index 8, as in the original script).
        inner_html = self.get_countent(url_list[8], 'inner')

        # The real video URL is embedded in inline JS, not in a tag,
        # so xpath cannot reach it — extract it with a regex.
        regex = re.compile('srcUrl="(.+?)"')
        matches = regex.findall(inner_html)
        print(matches)

        # BUG FIX: in the original, the f.write line was not indented
        # under `with` — a syntax error. Download and append the video.
        r = requests.get(matches[0])
        with open("./test_pear.mp4", 'ab') as f:
            f.write(r.content)



if __name__ == "__main__":
    # Instantiate the scraper.
    pearvideo = PearVideo()
    # BUG FIX: the original passed the *builtin* `type` instead of the
    # string 'index', so the 'index' branch in get_countent never ran.
    html = pearvideo.get_countent('https://www.pearvideo.com/', 'index')
    pearvideo.get_xpath(html)
 

 

 

 

 

 

多线程爬虫结构

# 导包
import threading
import requests
import time

# Container that collects the worker threads created below.
threads = []


# Return the current local time as a human-readable string
# (time.ctime() format, e.g. 'Thu Mar  7 16:56:00 2019').
def get_time():
    return time.ctime()

# Fetch the given URL and print the HTTP status code of the response.
def get_content(url):
    response = requests.get(url)
    print(response.status_code)


# Build 20 worker threads, each requesting the same URL.
for _ in range(20):
    worker = threading.Thread(
        target=get_content,
        args=('https://www.pearvideo.com',),
    )
    threads.append(worker)



if __name__ == "__main__":
    print('开始于:%s' % get_time())

    # Start every worker asynchronously.
    for t in threads:
        # Daemonize so stray workers do not block interpreter exit.
        # (setDaemon() is deprecated; assign the attribute instead.)
        t.daemon = True
        t.start()

    print('这一个执行到:%s' % get_time())

    # BUG FIX: the original joined only the last thread, so the other 19
    # daemonized workers could be killed mid-request when the main thread
    # exited. Wait for all of them.
    for t in threads:
        t.join()

    print('结束于:%s' % get_time())
 
 
 
多线程爬取视频
# 导入requests网络请求模块
import requests
# 导入lxml标签匹配模块
from lxml import etree
# 导入re 正则匹配模块
import re
#导入系统路径模块
import os
# 导入进程模块
import multiprocessing
import threading
 
 
# Collects the absolute detail-page URLs found by Pipa().
mylist = []

# Issue a GET request to `url` and return the raw response body as bytes.
def Data(url):
    response = requests.get(url)
    return response.content

# Scrape the category page and populate the module-level `mylist`
# with absolute detail-page URLs.
def Pipa():
    # Fetch the category listing page.
    res = Data('https://www.pearvideo.com/category_10')

    # Parse the raw bytes into an element tree.
    html = etree.HTML(res)

    # One <li> per video card; each card links to its detail page.
    for node in html.xpath('//*[@id="categoryList"]/li'):
        href = str(node.xpath('./div/a/@href')[0])
        mylist.append('https://www.pearvideo.com/' + href)

# Download the video behind one detail page and save it under ./video/.
#
# url: absolute detail-page URL; its last path segment names the file.
def xiangqing(url):

    # Derive the output file name from the URL slug.
    name = str(url).split('/')[-1] + '.mp4'
    print(name)

    # Fetch the detail page. The real video URL lives in inline JS,
    # not in a tag, so xpath cannot reach it — extract it with a regex.
    res = Data(url)
    url = re.findall('srcUrl="(.*?)",vdoUrl=srcUrl,skinRes="//www.pearvideo.com/domain/skin",videoCDN="//video.pearvideo.com";',
str(res))[0]

    # Fetch the video bytes themselves.
    res = Data(url)

    # Directory the videos are saved into.
    path = "video/"

    # BUG FIX: in the original, os.makedirs(path) was not indented under
    # the `if not os.path.exists(path)` — a syntax error. exist_ok=True
    # covers the existence check in one race-free call.
    os.makedirs(path, exist_ok=True)

    # Write the video to disk.
    with open(path + name, "wb") as f:
        f.write(res)


if __name__ == '__main__':
    # Collect the detail-page URLs first.
    Pipa()

    # BUG FIX: the original called join() immediately after start() inside
    # the loop, which made the main thread wait for each download before
    # starting the next — fully serial despite using threads. Start all
    # workers first, then wait for all of them, so downloads overlap.
    workers = []
    for i in mylist:
        t = threading.Thread(target=xiangqing, args=(i,))
        t.start()
        workers.append(t)
    for t in workers:
        t.join()
 
posted on 2019-03-07 16:56  小东子!  阅读(640)  评论(1)  编辑  收藏  举报
 

分类:

技术点:

相关文章:

  • 2021-09-17
  • 2021-11-06
  • 2021-11-23
  • 2021-11-06
  • 2021-12-16
  • 2021-12-09
  • 2022-12-23
  • 2021-12-22
猜你喜欢
  • 2021-12-04
  • 2021-12-04
  • 2021-12-04
  • 2021-11-06
  • 2022-12-23
  • 2022-12-23
相关资源
相似解决方案