- Elegantly expressive abstraction of simple web crawling and data extraction logic.
- Abstracts easily over any HTML parser library, such as Jsoup, htmlparser, etc.
- Lightweight (30 classes)
- The API itself has no dependencies. To use it, however, you choose your own HTML parser implementation and plug it in.
// Configure CrawlerContext
// linkExtractor and urlParser are only declared here; supply your own
// implementations for the document type you crawl (Document in this example).
Function<Document, Set<String>> linkExtractor;
com.bc.webcrawler.UrlParser<Document> urlParser;
// Both URL tests below accept every link.
final CrawlerContext<Document> context = CrawlerContext.builder(Document.class)
.crawlUrlTest((link) -> true)
.parseUrlTest((link) -> true)
// The URL handed to the crawler below (left unassigned in this excerpt).
String startUrl;
// Create the Crawler, seeded with the single start URL
final Crawler<Document> crawler = context.newCrawler(Collections.singleton(startUrl));
// Use the crawler: hasNext()/next() are called in a loop, each next() returning a Document
while(crawler.hasNext()) {
final Document doc = crawler.next();
// The meta data supplies the attempted and failed counts used in the message below.
final CrawlMetaData metaData = crawler.getMetaData();
"Attempted: {0}, failed: {1}",
metaData.getAttempted(), metaData.getFailed()));
/**
 * Example program that plugs a link extractor ({@code JsoupLinkExtractor}) and
 * a URL parser ({@code JsoupUrlParser}) into a {@code CrawlerContext} and then
 * iterates over the crawled documents in {@code main}.
 */
public class ReadMeJsoupCrawler {
// Logger named after this class; used by the nested classes for diagnostics.
private static final Logger logger = Logger.getLogger(ReadMeJsoupCrawler.class.getName());
/**
 * Maps a document to a set of link strings taken from its {@code a[href]}
 * elements.
 */
public static class JsoupLinkExtractor implements Function<Document, Set<String>> {
/**
 * @param doc the document to select {@code a[href]} elements from
 * @return the set held in {@code links}
 */
public Set<String> apply(Document doc) {
// Select every <a> element that carries an href attribute.
final Elements elements = doc.select("a[href]");
// Reuse the shared empty set when nothing matched; otherwise allocate a
// HashSet with the number of matched elements as its initial capacity.
final Set<String> links = elements.isEmpty() ? Collections.EMPTY_SET : new HashSet(elements.size());
for(Element element : elements) {
// NOTE(review): the "abs:" prefix presumably requests the absolute form of
// the href from the parser library -- confirm against its attr() docs.
final String link = element.attr("abs:href");
// Only non-null, non-empty values pass this check (its body is not shown in this excerpt).
if(link != null && !link.isEmpty()) {
return links;
/**
 * {@code UrlParser} implementation that turns a link into a {@code Document}
 * by way of a {@code Connection.Response}, and keeps a map of cookies.
 */
public static class JsoupUrlParser implements UrlParser<Document> {
// NOTE(review): units are not stated here; main() passes the sum of two
// values named connectTimeout and readTimeout -- presumably milliseconds.
private final int timeout;
private final int maxBodySize;
// Cookie name -> value; starts empty and is copied out by getCookies().
private final Map<String, String> cookies;
/**
 * Stores the given settings and starts with an empty cookie map.
 *
 * @param timeout the timeout value to keep for later use
 * @param maxBodySize the maximum body size value to keep for later use
 */
public JsoupUrlParser(int timeout, int maxBodySize) {
this.timeout = timeout;
this.maxBodySize = maxBodySize;
this.cookies = new HashMap<>();
/**
 * Parses the content behind the given link into a document.
 *
 * @param link the link to parse; passed to the {@code URL} constructor
 * @return the document produced by parsing the response
 * @throws MalformedURLException declared; the {@code URL} constructor rejects malformed links
 * @throws IOException declared by this method
 */
public Document parse(String link) throws MalformedURLException, IOException {
final URL url = new URL(link);
// The response is obtained through HttpConnection (call chain not shown in this excerpt).
final Connection.Response res = HttpConnection
final Document doc = res.parse();
return doc;
/**
 * Reads the cookies carried by the response and logs them at FINER level.
 *
 * @param res the response whose cookies are read
 */
private void collectCookies(Connection.Response res) {
final Map<String, String> cookiesFromRes = res.cookies();
// Supplier form, so the message is only built if FINER logging is enabled.
logger.finer(() -> "Cookies from response: " + cookiesFromRes);
/**
 * @return a new {@code LinkedHashMap} copy of the cookies, so callers never
 *         receive the internal map itself
 */
public Map<String, String> getCookies() {
return new LinkedHashMap(cookies);
/**
 * Builds a crawler context, crawls starting from {@code baseUrl} and prints
 * information about each document the crawler returns.
 *
 * @param args command line arguments; not read in the lines shown here
 * @throws IOException declared by this method
 */
public static void main(String... args) throws IOException {
final int connectTimeout = 10_000;
final int readTimeout = 20_000;
final int maxBodySize = 100_000_000;
final UrlParser<Document> parser;
// The parser takes a single timeout, so the two values are summed.
parser = new JsoupUrlParser(connectTimeout + readTimeout, maxBodySize);
final String baseUrl = "http://www.buzzwears.com";
final String startUrl = baseUrl;
// Matches one or more digits followed by an underscore.
final Pattern linkToScrappPattern = Pattern.compile("\\d{1,}_"); //Pattern.compile(".*");
// find() is used, so the pattern may match anywhere within the link.
final Predicate<String> linkToScrappTest = (link) -> linkToScrappPattern.matcher(link).find();
// Constructor arguments are not shown in this excerpt.
final ResumeHandler resumeHandler = new ResumeHandlerInMemoryCache(
// CSS query for the page's robots meta element.
final String robotsCss = "meta[name=robots]";
// True when the first robots meta element has a content attribute containing
// "noindex" (compared after lower-casing); false when no such element exists.
final Predicate<Document> docIsNoIndex = (doc) -> {
final Element robots = doc.select(robotsCss).first();
final String content = robots == null ? null : robots.attr("content");
return content == null ? false : content.toLowerCase().contains("noindex");
// Same check as above, but for "nofollow".
final Predicate<Document> docIsNoFollow = (doc) -> {
final Element robots = doc.select(robotsCss).first();
final String content = robots == null ? null : robots.attr("content");
return content == null ? false : content.toLowerCase().contains("nofollow");
// Multiplier applied to the limits, allowed failures and timeout configured below.
final int n = 1;
// With n = 1: crawl limit 100, parse limit 10, up to 9 failures, and a
// timeout of 3,600,000 ms. Both URL tests accept every link, the supplied
// retry test returns false for every exception, and links are passed
// through the formatter unchanged.
final CrawlerContext<Document> context = CrawlerContext.builder(Document.class)
.crawlLimit(100 * n)
.crawlUrlTest((link) -> true)
.linksExtractor(new JsoupLinkExtractor())
.maxFailsAllowed(9 * n)
.parseLimit(10 * n)
.parseUrlTest((link) -> true)
.retryOnExceptionTestSupplier(() -> (exception) -> false)
.timeoutMillis(3600_000 * n)
.urlFormatter((link) -> link)
// Seed the crawler with the single start URL.
final Crawler<Document> crawler = context.newCrawler(Collections.singleton(startUrl));
while(crawler.hasNext()) {
final Document doc = crawler.next();
// The meta data supplies the attempted and failed counts used in the message below.
final CrawlMetaData metaData = crawler.getMetaData();
"Attempted: {0}, failed: {1}",
metaData.getAttempted(), metaData.getFailed()));
// A null document is reported as a failure for the crawler's current URL.
if(doc == null) {
System.err.println("Failed: " + crawler.getCurrentUrl());
// NOTE(review): whatever follows the failure message inside the null check
// is not shown in this excerpt; the code below dereferences doc, so the
// full source must skip null documents before reaching it -- confirm.
final String url = doc.location();
System.out.println("URL: " + url + "\nTitle: " + doc.title());
final boolean isToScrapp = linkToScrappTest.test(url);
System.out.println("Scrapp: " + isToScrapp + ", URL: " + url);
if(isToScrapp) {
// Product id and price are read from the first elements whose itemprop
// attribute is "productID" and "price"; null is printed when one is absent.
final Element idElem = doc.getElementsByAttributeValue("itemprop", "productID").first();
final Element priceElem = doc.getElementsByAttributeValue("itemprop", "price").first();
System.out.println("Product. ID: " + (idElem==null?null:idElem.text()) +
", price: " + (priceElem==null?null:priceElem.text()));
The above code depends on: