jessezs

简单爬虫操作:1.简单爬取网页数据并输出 2.爬取数据打印到xls表格中

 

 

安装 Python 环境及 pip 可参考菜鸟教程:

传送门:https://www.runoob.com/w3cnote/python-pip-install-usage.html

1. 简单爬取网页数据并输出

 

 

import requests
from lxml import etree
import xlwt
# Download the listing page. A timeout keeps the script from hanging forever,
# and raise_for_status() stops early instead of parsing an error page.
html = requests.get("https://www.ghpym.com/category/videos", timeout=10)
html.raise_for_status()
# Tip: print(html.text) here to inspect the raw page when the XPath matches nothing.

# Convert the page source into an element tree that XPath can query.
etree_html = etree.HTML(html.text)

# Path to the title link of every list entry. For reference, the full path of
# the first entry's text is:
#   //*[@id="wrap"]/div/div/div/ul/li[1]/div[2]/h2/a/text()
link_xpath = '//*[@id="wrap"]/div/div/div/ul/li/div[2]/h2/a'

# First print every link target (@href), then every link caption (text()).
for selector in ('/@href', '/text()'):
    content = etree_html.xpath(link_xpath + selector)
    for each in content:
        # Strip line breaks and spaces; skip values that end up empty.
        cleaned = each.replace('\n', '').replace(' ', '')
        if cleaned:
            print(cleaned)

print("完成")

 

2.爬取数据打印到xls表格中

 

 

# coding:utf-8
from lxml import etree
import requests
import xlwt
# Module-level accumulator: get_film_name() extends this list with the hrefs it scrapes.
title=[]
def get_film_name(url):
    """Fetch ``url`` and collect the entry link targets found on the page.

    Args:
        url: Address of the listing page to download.

    Returns:
        None. The matched ``href`` strings are printed and appended to the
        module-level ``title`` list.

    Raises:
        requests.RequestException: If the request times out, the connection
            fails, or the server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=10)
    # Fail loudly on 4xx/5xx rather than silently scraping an error page.
    response.raise_for_status()
    # Tip: print(response.text) here to check the page actually has content.
    s = etree.HTML(response.text)  # element tree that XPath can query
    # xpath() returns a list with one href string per matched <a> element.
    filename = s.xpath('//*[@id="wrap"]/div/div/div/ul/li/div[2]/h2/a/@href')
    print(filename)
    title.extend(filename)
 
def get_all_film_name():
    """Scrape the video category listing page via ``get_film_name``.

    The earlier version looped over ``range(0, 250, 25)`` and called
    ``.format(i)`` on a URL that contains no ``{}`` placeholder, so the very
    same page was downloaded ten times and every link was stored ten times.
    The page is now fetched exactly once.
    """
    # NOTE(review): if further result pages should be scraped, add the site's
    # real pagination pattern here - it is not visible in this file.
    url = 'https://www.ghpym.com/category/videos'
    get_film_name(url)
# Run the export only when executed as a script. The previous guard tested the
# non-empty string '_main_', which is always true, so it also ran on import.
if __name__ == '__main__':
    myxls = xlwt.Workbook()
    sheet1 = myxls.add_sheet(u'top250', cell_overwrite_ok=True)
    get_all_film_name()
    # One row per collected link: column 0 is a 1-based index, column 1 the link.
    for row, link in enumerate(title):
        sheet1.write(row, 0, row + 1)
        sheet1.write(row, 1, link)
    myxls.save('top250.xls')
    print("完成")

 

分类:

技术点:

相关文章:

  • 2021-11-22
  • 2022-12-23
  • 2021-08-16
  • 2021-08-03
  • 2021-07-22
  • 2022-12-23
  • 2021-11-29
  • 2021-07-31
猜你喜欢
  • 2022-12-23
  • 2021-08-23
  • 2022-12-23
  • 2021-11-19
  • 2021-07-05
  • 2021-07-25
  • 2021-12-31
相关资源
相似解决方案