简单爬虫操作：1.简单爬取网页数据并输出 2.爬取数据打印到xls表格中

安装python环境参考菜鸟教程：

传送门：https://www.runoob.com/w3cnote/python-pip-install-usage.html

1.简单爬取网页数据并输出

import requests
from lxml import etree
import xlwt
# Fetch the page source. The timeout keeps the script from hanging on a stalled
# connection, and raise_for_status() surfaces HTTP errors instead of silently
# parsing an error page.
html = requests.get("https://www.ghpym.com/category/videos", timeout=10)
html.raise_for_status()
# To inspect the raw page source while debugging, print html.text here.

# Convert the source into an element tree that XPath can query.
etree_html = etree.HTML(html.text)


def _print_cleaned(values):
    """Print each value with newlines and spaces removed, skipping blank ones."""
    for value in values:
        cleaned = value.replace('\n', '').replace(' ', '')
        # After stripping, a value that was only whitespace is the empty string.
        if cleaned:
            print(cleaned)


# The href attribute of every link matched by the XPath.
# XPath of a single entry's text, for reference:
#   //*[@id="wrap"]/div/div/div/ul/li[1]/div[2]/h2/a/text()
content = etree_html.xpath('//*[@id="wrap"]/div/div/div/ul/li/div[2]/h2/a/@href')
_print_cleaned(content)

# The text of the same links.
content = etree_html.xpath('//*[@id="wrap"]/div/div/div/ul/li/div[2]/h2/a/text()')
_print_cleaned(content)

print("完成")

2.爬取数据打印到xls表格中

# coding:utf-8
from lxml import etree
import requests
import xlwt
# Module-level accumulator: get_film_name() appends every extracted link here.
title = []


def get_film_name(url):
    """Fetch *url* and append the matched link targets to ``title``.

    Despite the name, the XPath selects the ``href`` attribute of each matched
    link, so the collected values are link targets rather than link text.

    Args:
        url: Address of the listing page to download.

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status.
    """
    # The timeout prevents an indefinite hang; raise_for_status() avoids
    # silently parsing an error page.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    # Convert the source into an element tree that XPath can query.
    tree = etree.HTML(response.text)
    # xpath() returns a list of the matched attribute values.
    filename = tree.xpath('//*[@id="wrap"]/div/div/div/ul/li/div[2]/h2/a/@href')
    print(filename)
    title.extend(filename)
 
def get_all_film_name():
    """Collect the links of the video category page via ``get_film_name``.

    The category URL contains no page placeholder, so formatting an offset
    into it always yields the same address; requesting it repeatedly would
    only store every link multiple times. The page is therefore fetched once.
    """
    # NOTE(review): if the site paginates this category, build one URL per
    # page here -- the pagination scheme is not confirmed.
    url = 'https://www.ghpym.com/category/videos'
    get_film_name(url)
# Run the export only when executed as a script. A bare non-empty string such
# as '_main_' is always truthy, so it cannot serve as an entry-point guard.
if __name__ == '__main__':
    myxls = xlwt.Workbook()
    sheet1 = myxls.add_sheet(u'top250', cell_overwrite_ok=True)
    get_all_film_name()
    # Column 0 holds a 1-based running number, column 1 the collected link.
    for row, link in enumerate(title):
        sheet1.write(row, 0, row + 1)
        sheet1.write(row, 1, link)
    myxls.save('top250.xls')
    print("完成")