WebMagic是一个简单灵活的Java爬虫框架。基于WebMagic,你可以快速开发出一个高效、易维护的爬虫。
- 简单的API,可快速上手
- 模块化的结构,可轻松扩展
- 提供多线程和分布式支持
点击查看 WebMagic官网
1.引入WebMagic依赖
<!-- WebMagic core module (artifact webmagic-core, version 0.7.5) -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.5</version>
</dependency>
<!-- WebMagic extension module (artifact webmagic-extension); version kept identical to webmagic-core -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.5</version>
</dependency>
2.定义TestProcessor类
根据匹配器获取数据,封装数据
package com.bdbk.spider.processor;
import com.bdbk.spider.entity.User;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
/**
 * Page processor that extracts user rows from the element with id {@code table}
 * and hands them to the pipeline under the result key {@code "data"}.
 */
@Component
public class TestProcessor implements PageProcessor {

    /** Shared site configuration returned by {@link #getSite()}. */
    private static final Site SITE = Site.me()
            // Character set used for the site
            .setCharset("GBK")
            // Number of HTTP connection retries
            .setRetryTimes(3)
            // Thread sleep time
            .setSleepTime(3000);

    /**
     * Analyses a downloaded page and collects one {@code User} per table row.
     *
     * <p>Rows whose id cell is missing or is not a valid integer are skipped, so a
     * single malformed row no longer aborts processing of the whole page.
     *
     * @param page the download result wrapped as a {@code Page} object
     */
    @Override
    public void process(Page page) {
        Html html = page.getHtml();
        // Parse the page content: every <tr> under the tbody of #table
        List<Selectable> rows = html.css("#table").css("tbody > tr").nodes();
        List<User> userList = new ArrayList<>();
        // Iteration starts at index 1, so the first matched row is never read
        // NOTE(review): presumably a header row — confirm against the target page
        for (int i = 1; i < rows.size(); i++) {
            Selectable row = rows.get(i);
            String id = row.css("td:nth-child(1) > span > i", "text").toString();
            String name = row.css("td:nth-child(2) > span > i", "text").toString();

            Integer parsedId = parseId(id);
            if (parsedId == null) {
                // No usable id in this row; skip it instead of failing the page
                continue;
            }

            User user = new User();
            user.setId(parsedId.intValue());
            user.setName(name);
            userList.add(user);
        }
        // Pass the result on to the pipeline
        page.putField("data", userList);
    }

    /**
     * Parses the text of an id cell.
     *
     * @param raw the extracted cell text, possibly {@code null}
     * @return the parsed id, or {@code null} when {@code raw} is {@code null} or not an integer
     */
    private static Integer parseId(String raw) {
        if (raw == null) {
            return null;
        }
        try {
            return Integer.valueOf(raw.trim());
        } catch (NumberFormatException ignored) {
            // Deliberately ignored: a non-numeric id marks a row the caller should skip
            return null;
        }
    }

    /**
     * Returns the site configuration, which is created with {@code Site.me()}.
     *
     * @return the shared {@code Site} instance
     */
    @Override
    public Site getSite() {
        return SITE;
    }

    /**
     * Prints the value of {@code 1000*60*60*24*7} (604800000).
     * NOTE(review): appears unrelated to crawling — confirm whether it can be removed.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        int a = 1000 * 60 * 60 * 24 * 7;
        System.out.println(a);
    }
}
3.定义TestPipeline类
数据后续的处理,保存为文件或者存入数据库
package com.bdbk.spider.pipline;
import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
import com.bdbk.spider.entity.User;
import com.bdbk.spider.mapper.UserMapper;
import com.bdbk.spider.service.UserService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.util.List;
/**
 * Pipeline that persists the users collected by the page processor.
 * It reads the list stored under the result key {@code "data"} and saves each entry.
 */
@Component
public class TestPipeline implements Pipeline {

    @Autowired
    private UserService userService;

    // Injected but not referenced anywhere in this class
    @Autowired
    private UserMapper userMapper;

    /**
     * Saves every user found in the result items; does nothing when the list is
     * absent or empty.
     *
     * @param resultItems the extraction results; the user list is read from key {@code "data"}
     * @param task        the task the results belong to (not used here)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        List<User> userList = resultItems.get("data");
        // Plain null/empty guard: no collection-utility dependency is needed for this check
        if (userList == null || userList.isEmpty()) {
            return;
        }
        for (User user : userList) {
            userService.save(user);
        }
    }
}
4.单次执行
package com.bdbk.spider.controller;
import com.bdbk.spider.pipeline.TestPipeline;
import com.bdbk.spider.processor.TestProcessor;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.stereotype.Controller;
import us.codecraft.webmagic.Spider;
public void test() {
Spider.create(TestProcessor)
.addUrl("https://badianboke.com/user")
.addPipeline(TestPipeline)
.run();
}
5.定时执行
可以结合 Spring Boot 定时器实现定时爬取功能
点击查看 springboot定时任务使用教程
package com.bdbk.spider.config;
import com.bdbk.spider.common.zgzcw.ZgzcwPipeline;
import com.bdbk.spider.common.zgzcw.ZgzcwProcess;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import org.springframework.web.bind.annotation.GetMapping;
import us.codecraft.webmagic.Spider;
@Component
public class ScheduleConfig {
@Autowired
private TestProcessor testProcessor;
@Autowired
private TestPipeline testPipeline;
/**
 * Crawls the user list: builds a spider from the injected processor, registers the
 * start URL and the injected pipeline, then runs it.
 */
// Timer definition (currently commented out); the cron expression sets the execution time to every 4 hours
//@Scheduled(cron = "0 0 0/4 * * ?")
private void testCron() {
    Spider userListSpider = Spider.create(testProcessor);
    userListSpider.addUrl("https://www.badianboke.com/");
    userListSpider.addPipeline(testPipeline);
    userListSpider.run();
}