1.什么是WebMagic

WebMagic是一个简单灵活的Java爬虫框架。基于WebMagic,你可以快速开发出一个高效、易维护的爬虫。

http://webmagic.io/

特性:

  • 简单的API,可快速上手
  • 模块化的结构,可轻松扩展
  • 提供多线程和分布式支持

 

2.使用案例:

1.添加maven依赖

 

<!-- Maven dependencies used by the crawler example that follows. -->
<dependencies>


    <!-- WebMagic core: provides the Page, Site, Spider and PageProcessor types imported by the crawler class. -->
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-core</artifactId>
        <version>0.6.1</version>
    </dependency>
    <!-- WebMagic extension module. NOTE(review): no class shown in this example references it directly - confirm it is needed. -->
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-extension</artifactId>
        <version>0.6.1</version>
    </dependency>
    <!-- MySQL JDBC driver: supplies com.mysql.jdbc.Driver, which DBHelper loads by name. -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.42</version>
    </dependency>
    <!-- Oracle JDBC driver. NOTE(review): the example connects to MySQL only, so this looks unused - confirm before keeping. -->
    <dependency>
        <groupId>com.oracle</groupId>
        <artifactId>ojdbc6</artifactId>
        <version>1.0.0</version>
    </dependency>


</dependencies>

 

2.核心类的写法

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * WebMagic page processor that reads ranking data from the dongqiudi.com data page
 * and stores it through {@link BlogDao}.
 *
 * <p>For every downloaded page it evaluates four XPath expressions against the
 * {@code top_rank} table rows, takes the value each expression yields via {@code get()},
 * copies the values into a {@code position} record and saves that record.
 *
 * <p>The spider is started with five worker threads, so {@link #process(Page)} may run
 * concurrently; the shared record counter is therefore only updated under the class lock.
 */
public class FootProcessor implements PageProcessor {

    // Crawl settings for the target site: retry count of 10 and a sleep time of 1000 between requests.
    private Site site = Site.me().setRetryTimes(10).setSleepTime(1000);
    // Number of records processed so far. Shared by all crawler threads; modify only via nextCount().
    private static int num = 0;
    // Persistence object used to write each extracted record to the database.
    private BlogDao blogDao = new BlogDaoImpl();

    /**
     * Starts the crawler on the data page with five threads and prints the elapsed
     * wall-clock time (in whole seconds) measured around the {@code run()} call.
     *
     * @param args unused
     * @throws Exception declared for compatibility; nothing in this method throws a checked exception directly
     */
    public static void main(String[] args) throws Exception {
        System.out.println("========懂球帝小爬虫【启动】喽!=========");
        long startTime = System.currentTimeMillis();
        Spider.create(new FootProcessor()).addUrl("http://www.dongqiudi.com/data").thread(5).run();
        long endTime = System.currentTimeMillis();
        System.out.println("========懂球帝小爬虫【结束】喽!=========");
        System.out.println("一共爬用时为:"+(endTime-startTime)/1000+"s");
    }

    /**
     * Extracts id, rank, team name and score from the page and persists them.
     *
     * <p>When the score cell is not found the page is skipped (nothing is saved) and a
     * message is written to standard error, instead of relying on a caught
     * {@code NullPointerException} as the previous implementation did.
     *
     * @param page the downloaded page handed over by the spider
     */
    @Override
    public void process(Page page) {
        try {
            // Record id.
            // NOTE(review): id and rank are read with the same XPath (td[1]); confirm which column the id should come from.
            String id = page.getHtml().xpath("//tr[@class='top_rank']/td[1]/text()").get();
            System.out.println(id);
            // Rank.
            String position = page.getHtml().xpath("//tr[@class='top_rank']/td[1]/text()").get();
            System.out.println(position);
            // Team name.
            String name = page.getHtml().xpath("//tr[@class='top_rank']/td/a/text()").get();
            System.out.println(name);
            // Team score; may be null when the page has no matching row.
            String grade = page.getHtml().xpath("//tr[@class='top_rank']/td[10]/text()").get();
            if (grade == null) {
                System.err.println("No top_rank score cell found on page, skipping: " + page.getUrl());
                return;
            }
            grade = grade.trim();
            System.out.println(grade);

            // Collect the values in an entity object so they can be persisted.
            position blog = new position();
            blog.setId(id);
            blog.setPosition(position);
            blog.setname(name);
            blog.setgrade(grade);

            int count = nextCount();

            System.out.println("num:" + count + " " + blog.toString());
            blogDao.saveBlog(blog);
        } catch (Exception e) {
            // Boundary catch: a failure on one page must not stop the crawler threads.
            e.printStackTrace();
        }
    }

    /**
     * Increments the shared record counter under the class lock and returns the new value.
     */
    private static synchronized int nextCount() {
        return ++num;
    }

    /**
     * Returns the crawl settings configured for this processor.
     */
    @Override
    public Site getSite() {
        return this.site;
    }

}

3.数据持久化

/**
 * Persistence contract for the records collected by the crawler.
 */
public interface BlogDao {

    /**
     * Stores a single record.
     *
     * @param blog the record to store
     * @return the integer result reported by the implementation
     */
    int saveBlog(position blog);
}

 

 

 

import java.util.ArrayList;
import java.util.List;

/**
 * {@link BlogDao} implementation that inserts one record per call through {@link DBHelper}.
 */
public class BlogDaoImpl implements BlogDao{

    // NOTE(review): the original statement had no table name at all ("INSERT INTO (id,...)"),
    // which is a SQL syntax error. The name below is an assumption taken from the entity
    // class name - confirm it against the real schema before relying on it.
    private static final String TABLE_NAME = "position";

    // Backticks keep the table name from being parsed as the POSITION() function by MySQL.
    private static final String INSERT_SQL =
            "INSERT INTO `" + TABLE_NAME + "` (id,position,name,grade) VALUES (? , ? , ? , ? ) ";

    /**
     * Inserts the given record as one row.
     *
     * @param blog the record whose id, position, name and grade are written
     * @return the value returned by {@code DBHelper.executeUpdate} for the insert
     */
    @Override
    public int saveBlog(position blog){
        DBHelper dbhelper = new DBHelper();
        // Values bound to the four placeholders, in column order.
        List<String> sqlValues = new ArrayList<String>();
        sqlValues.add(blog.getId());
        sqlValues.add(blog.getPosition());
        sqlValues.add(blog.getname());
        sqlValues.add(blog.getgrade());
        return dbhelper.executeUpdate(INSERT_SQL, sqlValues);
    }
}

 

 

import java.sql.*;
import java.util.List;

/**
 * Small JDBC helper around one connection that is shared by all instances.
 *
 * <p>The connection is opened on first use, reopened when it is missing or closed, and
 * all access to the shared reference happens under the class lock. Query and update
 * methods report failure through their return value ({@code null} / {@code -1}) after
 * printing the stack trace, rather than throwing.
 */
public class DBHelper {

    public static final String driver_class = "com.mysql.jdbc.Driver";
    public static final String driver_url = "jdbc:mysql://localhost/football?useunicode=true&characterEncoding=utf8";
    public static final String user = "root";
    public static final String password = "root";
    // Connection shared by every DBHelper instance; read and written only under the class lock.
    private static Connection conn = null;
    // Statement and result set of the most recent successful executeQuery call, released by close().
    private PreparedStatement pst = null;
    private ResultSet rst = null;

    /**
     * Creates a helper and eagerly tries to open the shared connection.
     * A failure is printed; later calls will try to connect again.
     */
    public DBHelper() {
        getConnInstance();
    }

    /**
     * Returns the shared connection, opening it when it is absent or already closed.
     * Synchronized on the class so only one thread creates the connection.
     *
     * @return the shared connection, or {@code null} when it could not be opened
     */
    private static synchronized Connection getConnInstance() {
        try {
            if (conn == null || conn.isClosed()) {
                Class.forName(driver_class);
                conn = DriverManager.getConnection(driver_url, user, password);
                // Only report success when the connection was really obtained.
                System.out.println("连接数据库成功");
            }
        } catch (ClassNotFoundException e) {
            conn = null;
            e.printStackTrace();
        } catch (SQLException e) {
            conn = null;
            e.printStackTrace();
        }
        return conn;
    }

    /**
     * Closes the shared connection and clears the reference so the next use reconnects.
     */
    private static synchronized void closeConnInstance() throws SQLException {
        if (conn != null) {
            try {
                conn.close();
            } finally {
                conn = null;
            }
        }
    }

    /**
     * Releases the result set and statement of the latest query, then the shared connection.
     * Resources are closed in reverse order of creation.
     */
    public void close() {
        try {
            if (rst != null) {
                rst.close();
                rst = null;
            }
            if (pst != null) {
                pst.close();
                pst = null;
            }
            closeConnInstance();
            System.out.println("关闭数据库成功");
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Runs a parameterized query.
     *
     * <p>The statement and result set stay open so the caller can read the result; they are
     * remembered and released by {@link #close()}. Only the most recent query is tracked.
     *
     * @param sql       SQL text with {@code ?} placeholders
     * @param sqlValues values bound to the placeholders in order; may be {@code null} or empty
     * @return the result set, or {@code null} when no connection is available or the query failed
     */
    public ResultSet executeQuery(String sql, List<String> sqlValues) {
        Connection connection = getConnInstance();
        if (connection == null) {
            return null;
        }
        PreparedStatement statement = null;
        try {
            statement = connection.prepareStatement(sql);
            if (sqlValues != null && sqlValues.size() > 0) {
                setSqlValues(statement, sqlValues);
            }
            ResultSet result = statement.executeQuery();
            pst = statement;
            rst = result;
            return result;
        } catch (SQLException e) {
            e.printStackTrace();
            // The failed statement is of no use to the caller; release it right away.
            closeQuietly(statement);
            return null;
        }
    }

    /**
     * Runs a parameterized INSERT/UPDATE/DELETE and releases the statement afterwards.
     *
     * @param sql       SQL text with {@code ?} placeholders
     * @param sqlValues values bound to the placeholders in order; may be {@code null} or empty
     * @return the update count, or {@code -1} when no connection is available or the statement failed
     */
    public int executeUpdate(String sql, List<String> sqlValues) {
        int result = -1;
        Connection connection = getConnInstance();
        if (connection == null) {
            return result;
        }
        PreparedStatement statement = null;
        try {
            statement = connection.prepareStatement(sql);
            if (sqlValues != null && sqlValues.size() > 0) {
                setSqlValues(statement, sqlValues);
            }
            result = statement.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(statement);
        }

        return result;
    }

    /**
     * Binds the values to the statement placeholders, first value to index 1.
     *
     * @param pst       statement to bind to
     * @param sqlValues values in placeholder order
     * @throws SQLException when a value cannot be bound, so the caller does not execute a half-bound statement
     */
    private void setSqlValues(PreparedStatement pst, List<String> sqlValues) throws SQLException {
        for (int i = 0; i < sqlValues.size(); i++) {
            pst.setObject(i + 1, sqlValues.get(i));
        }
    }

    /**
     * Closes the statement if present, printing (not propagating) any failure.
     */
    private static void closeQuietly(Statement statement) {
        if (statement != null) {
            try {
                statement.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

}

4.实体类

/**
 * Plain data holder for one ranking row: id, rank, team name and score, all kept as text.
 */
public class position {
    private String id;
    private String position;
    private String name;
    private String grade;

    /** Returns the record id. */
    public String getId() {
        return id;
    }

    /** Sets the record id. */
    public void setId(String id) {
        this.id = id;
    }

    /** Returns the rank. */
    public String getPosition() {
        return position;
    }

    /** Sets the rank. */
    public void setPosition(String position) {
        this.position = position;
    }

    /** Returns the team name. */
    public String getname() {
        return name;
    }

    /** Sets the team name. */
    public void setname(String name) {
        this.name = name;
    }

    /** Returns the score. */
    public String getgrade() {
        return grade;
    }

    /** Sets the score. */
    public void setgrade(String grade) {
        this.grade = grade;
    }

    /**
     * Returns all four fields in a readable form, so that printing a record shows its
     * contents instead of the default class-name-and-hash text.
     */
    @Override
    public String toString() {
        return "position{id='" + id + "', position='" + position
                + "', name='" + name + "', grade='" + grade + "'}";
    }
}

4.建立相关的表

 

Java爬虫(四)框架的简单使用

 

5.运行结果

Java爬虫(四)框架的简单使用

数据库报错,需要进行相关修改

待更新

 

相关文章:

  • 2021-12-10
  • 2021-12-25
  • 2021-11-14
  • 2021-12-06
  • 2021-10-11
猜你喜欢
  • 2021-07-25
  • 2021-07-02
  • 2021-08-17
  • 2021-08-09
  • 2021-05-22
  • 2021-06-30
  • 2021-12-04
相关资源
相似解决方案