There are many ways to write data-scraping code. In the Java world, a library like Jsoup makes it easy to fetch and parse web pages; in the Python ecosystem, powerful frameworks such as Scrapy are available. In this post I will use Java and the Jsoup library to build a simple, general-purpose crawler template that is designed to be extensible and easy to modify.

Below is a simple, generic, and extensible crawler implemented with Java and Jsoup. It supports multi-level crawling, custom parsing rules, pluggable result storage, and concurrency control:
```java
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.*;
import java.util.concurrent.*;
import java.util.function.Function;

public class SimpleCrawler {

    // Crawler configuration
    public static class CrawlerConfig {
        private String startUrl;
        private int maxDepth = 1;
        private int timeoutMillis = 5000;
        private int maxPages = 100;
        private int maxConcurrency = 10;
        private List<DataExtractor> extractors = new ArrayList<>();
        private Function<String, Boolean> urlFilter = url -> true;

        public CrawlerConfig startUrl(String startUrl) { this.startUrl = startUrl; return this; }
        public CrawlerConfig maxDepth(int maxDepth) { this.maxDepth = maxDepth; return this; }
        public CrawlerConfig timeoutMillis(int timeoutMillis) { this.timeoutMillis = timeoutMillis; return this; }
        public CrawlerConfig maxPages(int maxPages) { this.maxPages = maxPages; return this; }
        public CrawlerConfig maxConcurrency(int maxConcurrency) { this.maxConcurrency = maxConcurrency; return this; }
        public CrawlerConfig addExtractor(DataExtractor extractor) { this.extractors.add(extractor); return this; }
        public CrawlerConfig urlFilter(Function<String, Boolean> urlFilter) { this.urlFilter = urlFilter; return this; }
    }

    // Data extractor interface
    public interface DataExtractor {
        String getName();
        void extract(Document doc, Map<String, Object> result);
        List<String> getLinks(Document doc);
    }

    // Result handler interface
    public interface ResultHandler {
        void handle(String url, Map<String, Object> data);
    }

    // Core crawler engine
    public static class CrawlerEngine {
        private final CrawlerConfig config;
        private final Set<String> visitedUrls = ConcurrentHashMap.newKeySet();
        private final Queue<PageTask> taskQueue = new ConcurrentLinkedQueue<>();
        private final ExecutorService executor;
        private final ResultHandler resultHandler;

        public CrawlerEngine(CrawlerConfig config, ResultHandler resultHandler) {
            this.config = config;
            this.resultHandler = resultHandler;
            this.executor = Executors.newFixedThreadPool(config.maxConcurrency);
        }

        public void start() {
            taskQueue.add(new PageTask(config.startUrl, 0));
            visitedUrls.add(config.startUrl);

            List<Future<?>> futures = new ArrayList<>();
            for (int i = 0; i < config.maxConcurrency; i++) {
                futures.add(executor.submit(this::processTasks));
            }

            // Wait for all workers to finish
            for (Future<?> future : futures) {
                try {
                    future.get();
                } catch (InterruptedException | ExecutionException e) {
                    Thread.currentThread().interrupt();
                }
            }
            executor.shutdown();
        }

        private void processTasks() {
            // Each worker stops when the queue is empty or the page limit is reached
            while (!taskQueue.isEmpty() && visitedUrls.size() < config.maxPages) {
                PageTask task = taskQueue.poll();
                if (task == null) continue;
                try {
                    Document doc = Jsoup.connect(task.url)
                            .timeout(config.timeoutMillis)
                            .userAgent("Mozilla/5.0 (compatible; SimpleCrawler/1.0)")
                            .get();

                    // Extract data from the page
                    Map<String, Object> pageData = new HashMap<>();
                    for (DataExtractor extractor : config.extractors) {
                        extractor.extract(doc, pageData);
                    }
                    resultHandler.handle(task.url, pageData);

                    // Follow links to the next depth level
                    if (task.depth < config.maxDepth) {
                        for (DataExtractor extractor : config.extractors) {
                            for (String link : extractor.getLinks(doc)) {
                                String absUrl = makeAbsoluteUrl(task.url, link);
                                if (shouldVisit(absUrl)) {
                                    taskQueue.add(new PageTask(absUrl, task.depth + 1));
                                    visitedUrls.add(absUrl);
                                }
                            }
                        }
                    }
                } catch (Exception e) {
                    System.err.println("Error processing: " + task.url + " - " + e.getMessage());
                }
            }
        }

        private boolean shouldVisit(String url) {
            return url != null
                    && !visitedUrls.contains(url)
                    && config.urlFilter.apply(url)
                    && visitedUrls.size() < config.maxPages;
        }

        private String makeAbsoluteUrl(String baseUrl, String relativeUrl) {
            try {
                return new java.net.URL(new java.net.URL(baseUrl), relativeUrl).toString();
            } catch (Exception e) {
                return null;
            }
        }

        private static class PageTask {
            String url;
            int depth;

            PageTask(String url, int depth) {
                this.url = url;
                this.depth = depth;
            }
        }
    }

    // Example usage
    public static void main(String[] args) {
        // 1. Build the configuration
        CrawlerConfig config = new CrawlerConfig()
                .startUrl("https://example.com")
                .maxDepth(2)
                .maxPages(50)
                .maxConcurrency(5)
                .urlFilter(url -> url.startsWith("https://example.com"))
                .addExtractor(new TitleExtractor())
                .addExtractor(new LinkExtractor("a[href]", "href"))
                .addExtractor(new ContentExtractor("div.content"));

        // 2. Create a result handler
        ResultHandler consoleHandler = (url, data) -> {
            System.out.println("\nURL: " + url);
            data.forEach((key, value) -> System.out.println(key + ": " + value));
        };

        // 3. Start the crawler
        new CrawlerEngine(config, consoleHandler).start();
    }

    // Sample extractor: page title
    static class TitleExtractor implements DataExtractor {
        @Override
        public String getName() { return "title"; }

        @Override
        public void extract(Document doc, Map<String, Object> result) {
            String title = doc.title();
            if (title != null && !title.isEmpty()) {
                result.put(getName(), title);
            }
        }

        @Override
        public List<String> getLinks(Document doc) {
            return Collections.emptyList(); // this extractor does not contribute links
        }
    }

    // Sample extractor: links
    static class LinkExtractor implements DataExtractor {
        private final String selector;
        private final String attr;

        LinkExtractor(String selector, String attr) {
            this.selector = selector;
            this.attr = attr;
        }

        @Override
        public String getName() { return "links"; }

        @Override
        public void extract(Document doc, Map<String, Object> result) {
            // Extracted links are usually not stored in the result map
        }

        @Override
        public List<String> getLinks(Document doc) {
            List<String> links = new ArrayList<>();
            Elements elements = doc.select(selector);
            for (Element el : elements) {
                String link = el.attr("abs:" + attr);
                if (!link.isEmpty()) links.add(link);
            }
            return links;
        }
    }

    // Sample extractor: main content
    static class ContentExtractor implements DataExtractor {
        private final String selector;

        ContentExtractor(String selector) {
            this.selector = selector;
        }

        @Override
        public String getName() { return "content"; }

        @Override
        public void extract(Document doc, Map<String, Object> result) {
            Elements elements = doc.select(selector);
            if (!elements.isEmpty()) {
                result.put(getName(), elements.first().text());
            }
        }

        @Override
        public List<String> getLinks(Document doc) {
            return Collections.emptyList();
        }
    }
}
```
Core design features:

- Modular design:
  - `CrawlerConfig`: centralizes crawler configuration
  - `DataExtractor`: extensible data-extraction interface
  - `ResultHandler`: result-handling interface
  - `CrawlerEngine`: core crawling logic
- Extensibility:
  - Add new parsing rules by implementing the `DataExtractor` interface
  - Support different output targets (file, database, etc.) by implementing `ResultHandler`, as in the file-output sketch after this list
  - Customize URL filtering through the functional `urlFilter` interface
- Concurrency control:
  - A thread pool manages concurrent requests
  - `ConcurrentHashMap` keeps the visited-URL set thread-safe
  - `ConcurrentLinkedQueue` serves as the task queue
- Robustness:
  - Connection timeout setting
  - URL normalization
  - Exception handling
  - Maximum page limit
- Configuration options:
  - Crawl depth control
  - Maximum page limit
  - Number of concurrent threads
  - Request timeout
  - Custom URL filtering
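For example, persisting results to a file only requires another `ResultHandler` implementation. The following is a minimal sketch, written as if it were declared as another static nested class inside `SimpleCrawler`; the file name, the rough JSON-style formatting, and the quote escaping are illustrative assumptions, not part of the original code:

```java
// A minimal sketch: a ResultHandler that appends each result as one roughly
// JSON-shaped line to a local file. The file name is an arbitrary choice.
static class FileResultHandler implements ResultHandler {
    private final java.nio.file.Path output;

    FileResultHandler(String fileName) {
        this.output = java.nio.file.Paths.get(fileName);
    }

    @Override
    public synchronized void handle(String url, Map<String, Object> data) {
        // Build one line per page; naive escaping is enough for a sketch
        StringBuilder line = new StringBuilder("{\"url\":\"").append(url).append("\"");
        data.forEach((key, value) ->
                line.append(",\"").append(key).append("\":\"")
                    .append(String.valueOf(value).replace("\"", "'")).append("\""));
        line.append("}").append(System.lineSeparator());
        try {
            java.nio.file.Files.writeString(output, line,
                    java.nio.charset.StandardCharsets.UTF_8,
                    java.nio.file.StandardOpenOption.CREATE,
                    java.nio.file.StandardOpenOption.APPEND);
        } catch (java.io.IOException e) {
            System.err.println("Failed to write result for " + url + ": " + e.getMessage());
        }
    }
}
```

It plugs in exactly like the console handler: `new CrawlerEngine(config, new FileResultHandler("results.jsonl")).start();`.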
Usage example:

```java
public static void main(String[] args) {
    // Build the crawler configuration
    CrawlerConfig config = new CrawlerConfig()
            .startUrl("https://news.example.com")
            .maxDepth(3)
            .maxPages(100)
            .urlFilter(url -> url.contains("/articles/"))
            .addExtractor(new TitleExtractor())
            .addExtractor(new LinkExtractor("a.article-link", "href"))
            .addExtractor(new AuthorExtractor("span.author"))   // custom extractor
            .addExtractor(new DateExtractor("time.published")); // custom extractor

    // Create a result handler (could be replaced by database storage)
    ResultHandler dbHandler = (url, data) -> {
        // Implement the database persistence logic here
        System.out.println("Saving to DB: " + url);
    };

    // Start the crawler
    new CrawlerEngine(config, dbHandler).start();
}
```
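The `dbHandler` above only prints a placeholder. If results were to go into a relational database, the handler could look roughly like the sketch below, using only the standard JDBC API; the connection URL, credentials, and the `pages(url, title, content)` table are hypothetical and need to match your own schema:

```java
// A rough sketch of a database-backed ResultHandler using plain JDBC.
// Opening a connection per page keeps the example short; in practice a
// connection pool would be preferable.
ResultHandler dbHandler = (url, data) -> {
    String sql = "INSERT INTO pages (url, title, content) VALUES (?, ?, ?)";
    try (java.sql.Connection conn = java.sql.DriverManager.getConnection(
                 "jdbc:mysql://localhost:3306/crawler", "user", "password");
         java.sql.PreparedStatement stmt = conn.prepareStatement(sql)) {
        stmt.setString(1, url);
        stmt.setString(2, String.valueOf(data.getOrDefault("title", "")));
        stmt.setString(3, String.valueOf(data.getOrDefault("content", "")));
        stmt.executeUpdate();
    } catch (java.sql.SQLException e) {
        System.err.println("Failed to save " + url + ": " + e.getMessage());
    }
};
```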
Custom extractor example:

```java
// Author extractor
static class AuthorExtractor implements DataExtractor {
    private final String selector;

    AuthorExtractor(String selector) {
        this.selector = selector;
    }

    @Override
    public String getName() { return "author"; }

    @Override
    public void extract(Document doc, Map<String, Object> result) {
        Element author = doc.selectFirst(selector);
        if (author != null) {
            result.put(getName(), author.text());
        }
    }

    @Override
    public List<String> getLinks(Document doc) {
        return Collections.emptyList();
    }
}
```
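The usage example also references a `DateExtractor`, which is not part of the original listing. Following the same pattern, it could be sketched like this; falling back from the `datetime` attribute to the element text is an assumption for illustration:

```java
// A sketch of the DateExtractor referenced in the usage example: reads the
// publication date from a <time> element, preferring its datetime attribute.
static class DateExtractor implements DataExtractor {
    private final String selector;

    DateExtractor(String selector) {
        this.selector = selector;
    }

    @Override
    public String getName() { return "date"; }

    @Override
    public void extract(Document doc, Map<String, Object> result) {
        Element time = doc.selectFirst(selector);
        if (time != null) {
            String value = time.hasAttr("datetime") ? time.attr("datetime") : time.text();
            if (!value.isEmpty()) {
                result.put(getName(), value);
            }
        }
    }

    @Override
    public List<String> getLinks(Document doc) {
        return Collections.emptyList();
    }
}
```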
Best-practice recommendations:

- Respect robots.txt: add robots.txt parsing before using this in a real project
- Rate limiting: add a delay between requests to avoid getting banned (a small sketch follows this list)
- Error handling: strengthen network-error handling and add a retry mechanism (see the same sketch)
- Proxy support: add proxy rotation
- Deduplication: use a Bloom filter to make URL deduplication more memory-efficient
- Distributed scaling: for large-scale crawling, evolve the design into a distributed architecture
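As a concrete illustration of the rate-limiting and retry points, the page fetch inside `processTasks` could be wrapped roughly like this. The sketch is written as if it were a private method of `CrawlerEngine` (so it can reuse `config`), and the 1000 ms delay, 3-attempt limit, and linear backoff are arbitrary example values:

```java
// A minimal sketch of polite fetching: a fixed delay before every request plus
// a small retry loop with simple backoff. All numbers are example values only.
private Document fetchWithRetry(String url) throws InterruptedException {
    int attempts = 0;
    while (true) {
        try {
            Thread.sleep(1000); // crude rate limiting: pause before every request
            return Jsoup.connect(url)
                    .timeout(config.timeoutMillis)
                    .userAgent("Mozilla/5.0 (compatible; SimpleCrawler/1.0)")
                    .get();
        } catch (java.io.IOException e) {
            attempts++;
            if (attempts >= 3) {
                throw new RuntimeException("Giving up on " + url + " after 3 attempts", e);
            }
            Thread.sleep(2000L * attempts); // back off a little longer after each failure
        }
    }
}
```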
This crawler skeleton provides a solid foundation. In real projects it can be extended to match your specific needs, for example: JavaScript rendering (with Selenium or HtmlUnit, sketched below), automatic pagination, CAPTCHA handling, or more sophisticated scheduling algorithms. Always make sure you comply with the target site's crawling policy.
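One way to add JavaScript rendering without touching the extractor interfaces is to render the page with HtmlUnit first and then hand the resulting HTML to Jsoup. The sketch below assumes HtmlUnit is on the classpath; note that the package name differs between HtmlUnit versions (`com.gargoylesoftware.htmlunit` in 2.x, `org.htmlunit` in 3.x), and the 2-second wait for background scripts is an arbitrary example value:

```java
// A rough sketch: render a JavaScript-heavy page with HtmlUnit, then parse the
// resulting HTML with Jsoup so the existing DataExtractor implementations keep working.
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class RenderedFetcher {
    public static Document fetchRendered(String url) throws Exception {
        try (WebClient client = new WebClient()) {
            client.getOptions().setJavaScriptEnabled(true);
            client.getOptions().setCssEnabled(false);
            client.getOptions().setThrowExceptionOnScriptError(false);
            HtmlPage page = client.getPage(url);
            client.waitForBackgroundJavaScript(2000); // example wait for async scripts
            return Jsoup.parse(page.asXml(), url);    // feed rendered HTML back to Jsoup
        }
    }
}
```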