diff --git a/pom.xml b/pom.xml
index d016d0a92..115e4d952 100644
--- a/pom.xml
+++ b/pom.xml
@@ -193,6 +193,11 @@
jedis
2.9.3
+        <dependency>
+            <groupId>net.jcip</groupId>
+            <artifactId>jcip-annotations</artifactId>
+            <version>1.0</version>
+        </dependency>
diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml
index 44fb7fa4d..a5c30982c 100644
--- a/webmagic-core/pom.xml
+++ b/webmagic-core/pom.xml
@@ -80,7 +80,10 @@
com.alibaba
fastjson
-
+        <dependency>
+            <groupId>net.jcip</groupId>
+            <artifactId>jcip-annotations</artifactId>
+        </dependency>
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
index 72cc7d058..726955382 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
@@ -52,7 +52,7 @@ public class Site {
*
* @return new site
*/
- public static Site me() {
+ public static Site me() {
return new Site();
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
index ee94581ad..666975974 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
@@ -32,6 +32,7 @@
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.protocol.HttpContext;
+import org.apache.http.ssl.SSLContexts;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -142,7 +143,15 @@ public void process(
connectionManager.setDefaultSocketConfig(socketConfig);
httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
generateCookie(httpClientBuilder, site);
- return httpClientBuilder.build();
+ SSLContext ctx = null;
+ try {
+ ctx = SSLContexts.custom().useProtocol("TLSv1.2").build();
+ } catch (NoSuchAlgorithmException e) {
+ logger.warn("CloseableHttpClient getClient #NoSuchAlgorithmException", e);
+ } catch (KeyManagementException e) {
+ logger.warn("CloseableHttpClient getClient #KeyManagementException", e);
+ }
+ return httpClientBuilder.setSSLContext(ctx).build();
}
private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) {
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java
index be9fd7cc2..409c5775a 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java
@@ -1,5 +1,6 @@
package us.codecraft.webmagic.pipeline;
+import net.jcip.annotations.ThreadSafe;
import org.apache.commons.codec.digest.DigestUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -19,7 +20,9 @@
* @author code4crafter@gmail.com
* @since 0.1.0
*/
-public class FilePipeline extends FilePersistentBase implements Pipeline {
+@ThreadSafe
+public class FilePipeline extends FilePersistentBase implements Pipeline {
+
private Logger logger = LoggerFactory.getLogger(getClass());
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java
index f6ad87e05..2949f8c8a 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java
@@ -39,7 +39,7 @@ public static void main(String[] args) {
//multidownload
List list = new ArrayList();
list.add(String.format(urlTemplate,"风力发电"));
- list.add(String.format(urlTemplate,"太阳能"));
+ // list.add(String.format(urlTemplate,"太阳能"));
list.add(String.format(urlTemplate,"地热发电"));
list.add(String.format(urlTemplate,"地热发电"));
List resultItemses = spider.getAll(list);
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/DownImgUtil.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/DownImgUtil.java
new file mode 100644
index 000000000..d4a285064
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/DownImgUtil.java
@@ -0,0 +1,83 @@
+package us.codecraft.webmagic.processor.example;
+
+
+import java.io.*;
+import java.net.HttpURLConnection;
+import java.net.URL;
+
+/**
+ * Created by wpx on 2018/7/17.
+ */
+public class DownImgUtil {
+ /**
+ * 从网络Url中下载文件
+ * @param urlStr
+ * @param fileName
+ * @param savePath
+ * @throws IOException
+ */
+ public static void downLoadFromUrl(String urlStr,String fileName,String savePath) throws IOException{
+ System.setProperty("javax.net.debug", "all");
+
+ URL url = new URL(urlStr);
+ HttpURLConnection conn = (HttpURLConnection)url.openConnection();
+ conn.setRequestMethod("GET");
+ //设置超时间为3秒
+ conn.setConnectTimeout(3*1000);
+ //防止屏蔽程序抓取而返回403错误
+ conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
+
+ //得到输入流
+ InputStream inputStream = conn.getInputStream();
+ //获取自己数组
+ byte[] getData = readInputStream(inputStream);
+
+ //文件保存位置
+ File saveDir = new File(savePath);
+ if(!saveDir.exists()){
+ saveDir.mkdirs();
+ }
+ File file = new File(saveDir+File.separator+fileName);
+ try (FileOutputStream fos = new FileOutputStream(file)) {
+ fos.write(getData);
+ }
+ // close the connection's input stream once the bytes are written
+ if(inputStream!=null){
+ inputStream.close();
+ }
+
+
+
+ System.out.println("info:"+url+" download success");
+
+ }
+
+
+
+ /**
+ * 从输入流中获取字节数组
+ * @param inputStream
+ * @return
+ * @throws IOException
+ */
+ public static byte[] readInputStream(InputStream inputStream) throws IOException {
+ byte[] buffer = new byte[1024];
+ int len = 0;
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ while((len = inputStream.read(buffer)) != -1) {
+ bos.write(buffer, 0, len);
+ }
+ bos.close();
+ return bos.toByteArray();
+ }
+
+ public static void main(String[] args) {
+ try{
+ downLoadFromUrl("http://img1.mm131.me/pic/4170/15.jpg", "百度.jpg","D:\\webimg");
+ }catch (Exception e) {
+ // TODO: handle exception
+ System.out.println("");
+ }
+ }
+
+}
\ No newline at end of file
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java
index e93ab4cd5..5b65b7b08 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java
@@ -6,17 +6,21 @@
import us.codecraft.webmagic.processor.PageProcessor;
/**
+ * 这部分我们直接通过GithubRepoPageProcessor这个例子来介绍PageProcessor的编写方式。
+ * 我将PageProcessor的定制分为三个部分,分别是爬虫的配置、页面元素的抽取和链接的发现。
+ *
* @author code4crafter@gmail.com
* @since 0.3.2
*/
public class GithubRepoPageProcessor implements PageProcessor {
+ // 部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数等
private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);
+ // process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑
@Override
public void process(Page page) {
- page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
- page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-])").all());
+ // 部分二:定义如何抽取页面信息,并保存下来
page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());
if (page.getResultItems().get("name")==null){
@@ -24,6 +28,13 @@ public void process(Page page) {
page.setSkip(true);
}
page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
+ // 部分三:从页面发现后续的url地址来抓取
+ /**
+ * page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()用于获取所有满足"(https:/ /github\.com/\w+/\w+)"
+ * 这个正则表达式的链接,page.addTargetRequests()则将这些链接加入到待抓取的队列中去。
+ */
+ page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
+ page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-])").all());
}
@Override
@@ -32,6 +43,9 @@ public Site getSite() {
}
public static void main(String[] args) {
- Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run();
+ //https://blog.csdn.net/bbc2005/article/details/80890829
+ //https://www.cnblogs.com/sunny08/p/8038440.html
+ // System.setProperty("javax.net.debug", "all"); //打印网络连接握手信息
+ Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(1).run();
}
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ImgPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ImgPageProcessor.java
new file mode 100644
index 000000000..04d740eff
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ImgPageProcessor.java
@@ -0,0 +1,77 @@
+package us.codecraft.webmagic.processor.example;
+
+import org.apache.commons.collections.CollectionUtils;
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * 这部分我们直接通过GithubRepoPageProcessor这个例子来介绍PageProcessor的编写方式。
+ * 我将PageProcessor的定制分为三个部分,分别是爬虫的配置、页面元素的抽取和链接的发现。
+ *
+ * @author code4crafter@gmail.com
+ * @since 0.3.2
+ */
+public class ImgPageProcessor implements PageProcessor {
+ // 部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数等
+
+ private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);
+
+ // process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑
+ @Override
+ public void process(Page page) {
+ // 部分二:定义如何抽取页面信息,并保存下来
+ String imgName = page.getHtml().xpath("//div[@class='content-pic']/a/img//@alt").toString();
+ String imgUrl = page.getHtml().xpath("//div[@class='content-pic']/a/img//@src").toString();
+ String urlpre = "";
+ URL url = null;
+ try {
+ url = new URL(page.getUrl().toString());
+ System.out.println("");
+ urlpre = url.getProtocol()+"://" + url.getHost() +"/"+ url.getPath().split("/")[1];
+ } catch (MalformedURLException e) {
+ e.printStackTrace();
+ }
+ if(imgUrl != null && !imgUrl.isEmpty() && imgName != null && !imgName.isEmpty()){
+ try {
+ DownImgUtil.downLoadFromUrl(imgUrl, imgName,"");
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+
+ // 部分三:从页面发现后续的url地址来抓取
+ /**
+ * page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()用于获取所有满足"(https:/ /github\.com/\w+/\w+)"
+ * 这个正则表达式的链接,page.addTargetRequests()则将这些链接加入到待抓取的队列中去。
+ */
+ List<String> urls = page.getHtml().links().xpath("/html/body/div[6]/div[3]//@href").all();
+ List<String> handledUrls = new ArrayList<String>();
+ if(CollectionUtils.isNotEmpty(urls)) {
+ for(String temp : urls) {
+ handledUrls.add(urlpre+temp);
+ }
+ }
+ page.addTargetRequests(handledUrls);
+ }
+
+ @Override
+ public Site getSite() {
+ return site;
+ }
+
+ public static void main(String[] args) {
+ //https://blog.csdn.net/bbc2005/article/details/80890829
+ //https://www.cnblogs.com/sunny08/p/8038440.html
+ // System.setProperty("javax.net.debug", "all"); //打印网络连接握手信息
+ Spider.create(new ImgPageProcessor()).addUrl("http://www.mm131.com/xinggan/4170.html").thread(1).run();
+ }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java
index 003c5730d..fa6b6a65b 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java
@@ -37,7 +37,7 @@ public static void main(String[] args) {
//multidownload
List list = new ArrayList();
list.add(String.format(urlTemplate,"风力发电"));
- list.add(String.format(urlTemplate,"太阳能"));
+ // list.add(String.format(urlTemplate,"太阳能"));
list.add(String.format(urlTemplate,"地热发电"));
list.add(String.format(urlTemplate,"地热发电"));
List resultItemses = ooSpider.getAll(list);
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageModelExtractorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageModelExtractorTest.java
index f212628b4..83a91b9a6 100644
--- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageModelExtractorTest.java
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageModelExtractorTest.java
@@ -31,7 +31,7 @@ public static class ModelDateStr {
public static class ModelDate {
- @Formatter(value = "yyyyMMdd", formatter = DateFormatter.class)
+ //@Formatter(value = "yyyyMMdd", formatter = DateFormatter.class)
@ExtractBy(value = "//div[@class='date']/text()", notNull = true)
private Date date;
@@ -53,7 +53,7 @@ public static class ModelStringList {
public static class ModelIntList {
- @Formatter(subClazz = Integer.class)
+ //@Formatter(subClazz = Integer.class)
@ExtractBy("//li[@class='numbers']/text()")
private List numbers;
@@ -61,7 +61,7 @@ public static class ModelIntList {
public static class ModelDateList {
- @Formatter(subClazz = Date.class, value = "yyyyMMdd")
+ ///@Formatter(subClazz = Date.class, value = "yyyyMMdd")
@ExtractBy("//li[@class='dates']/text()")
private List dates;
@@ -69,7 +69,7 @@ public static class ModelDateList {
public static class ModelCustomList {
- @Formatter(subClazz = Date.class, value = "yyyyMMdd",formatter = DateFormatter.class)
+ //@Formatter(subClazz = Date.class, value = "yyyyMMdd",formatter = DateFormatter.class)
@ExtractBy("//li[@class='dates']/text()")
private List dates;
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/BaiduPictureDownloadProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/BaiduPictureDownloadProcesser.java
new file mode 100644
index 000000000..a9cdb1e27
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/BaiduPictureDownloadProcesser.java
@@ -0,0 +1,193 @@
+package us.codecraft.webmagic.samples;
+
+import com.alibaba.fastjson.JSONArray;
+import com.alibaba.fastjson.JSONObject;
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.CopyOnWriteArrayList;
+import java.util.concurrent.Exchanger;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+public class BaiduPictureDownloadProcesser implements PageProcessor {
+ ExecutorService executorService = Executors.newFixedThreadPool(10);
+
+ private Site site = Site.me().setCycleRetryTimes(5).setRetryTimes(5).setSleepTime(500).setTimeOut(3 * 60 * 1000)
+ .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0")
+ .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
+ .addHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3")
+ .setCharset("UTF-8");
+ private final static Map<String, String> KEY_WORDS = new HashMap<>();
+
+ static {
+ //奶制品
+ KEY_WORDS.put("牛奶", "牛奶");
+ KEY_WORDS.put("奶酪", "奶酪");
+ KEY_WORDS.put("酸奶", "酸奶");
+ //哺乳动物肉
+ KEY_WORDS.put("羊肉", "肉");
+ KEY_WORDS.put("牛肉", "肉");
+ KEY_WORDS.put("狗肉", "肉");
+ KEY_WORDS.put("驴肉", "肉");
+ KEY_WORDS.put("猪肉", "肉");
+ //家禽肉
+ KEY_WORDS.put("鸡", "家禽肉");
+ KEY_WORDS.put("鸭", "家禽肉");
+ KEY_WORDS.put("鹅", "家禽肉");
+ //蛋类
+ KEY_WORDS.put("鸡蛋", "蛋");
+ KEY_WORDS.put("鸭蛋", "蛋");
+ KEY_WORDS.put("鸽子蛋", "蛋");
+ //蔬菜
+ KEY_WORDS.put("冬瓜", "冬瓜");
+ KEY_WORDS.put("西红柿", "西红柿");
+ KEY_WORDS.put("苦瓜", "苦瓜");
+ KEY_WORDS.put("青椒", "青椒");
+ KEY_WORDS.put("胡萝卜", "胡萝卜");
+ KEY_WORDS.put("南瓜", "南瓜");
+ KEY_WORDS.put("玉米", "玉米");
+ KEY_WORDS.put("秋葵", "秋葵");
+ KEY_WORDS.put("西兰花", "西兰花");
+ KEY_WORDS.put("生姜", "生姜");
+ //水果
+ KEY_WORDS.put("苹果", "苹果");
+ KEY_WORDS.put("梨", "梨");
+ KEY_WORDS.put("香蕉", "香蕉");
+ KEY_WORDS.put("葡萄", "葡萄");
+ KEY_WORDS.put("榴莲", "榴莲");
+ KEY_WORDS.put("猕猴桃", "猕猴桃");
+ KEY_WORDS.put("哈密瓜", "哈密瓜");
+ KEY_WORDS.put("草莓", "草莓");
+ KEY_WORDS.put("橘子", "橘子");
+ KEY_WORDS.put("菠萝", "菠萝");
+ KEY_WORDS.put("山楂", "山楂");
+ KEY_WORDS.put("桂圆", "桂圆");
+ //水产品
+ KEY_WORDS.put("虾", "虾");
+ KEY_WORDS.put("蟹", "蟹");
+ KEY_WORDS.put("鱼", "鱼");
+ KEY_WORDS.put("贝类", "贝类");
+ KEY_WORDS.put("螺类", "螺类");
+ KEY_WORDS.put("海参类", "海参类");
+ //
+ KEY_WORDS.put("豆皮", "豆皮");
+ KEY_WORDS.put("豆腐脑", "豆腐脑");
+ KEY_WORDS.put("豆干", "豆干");
+ KEY_WORDS.put("豆腐", "豆腐");
+ //坚果
+ KEY_WORDS.put("腰果", "腰果");
+ KEY_WORDS.put("开心果", "开心果");
+ KEY_WORDS.put("核桃", "核桃");
+ KEY_WORDS.put("葡萄干", "葡萄干");
+ KEY_WORDS.put("夏威夷果", "夏威夷果");
+ }
+
+ @Override
+ public void process(Page page) {
+ List<String> url_list = new ArrayList<>();
+ List<String> name_list = new ArrayList<>();
+ JSONObject jsonObject = (JSONObject) JSONObject.parse(page.getRawText());
+ JSONArray data = (JSONArray) jsonObject.get("data");
+ for (int i = 0; i < data.size(); i++) {
+ String url = (String) data.getJSONObject(i).get("thumbURL");
+ String name = (String) data.getJSONObject(i).get("fromPageTitleEnc");
+ if (url != null) {
+ url_list.add(url);
+ name_list.add(name);
+ }
+ }
+ setUrls(url_list);
+ setNames(name_list);
+
+ }
+
+ @Override
+ public Site getSite() {
+ return this.site;
+ }
+
+ private void downloadPicture(List<String> urlList, String key, String keyName) {
+ URL url = null;
+ for (int i = 0; i < urlList.size(); i++) {
+ try {
+ url = new URL(urlList.get(i));
+ DataInputStream dataInputStream = new DataInputStream(url.openStream());
+ String imageName = i + ".jpg";
+ createDir("d:\\pic\\" + keyName);
+ File file = new File("d:\\pic\\" + keyName); //设置下载路径
+ if (!file.isDirectory()) {
+ file.mkdirs();
+ }
+ FileOutputStream fileOutputStream = new FileOutputStream(new File("d:\\pic\\" + keyName + "\\" + imageName.trim()));
+ byte[] buffer = new byte[1024];
+ int length;
+ while ((length = dataInputStream.read(buffer)) > 0) {
+ fileOutputStream.write(buffer, 0, length);
+ }
+ dataInputStream.close();
+ fileOutputStream.close();
+ } catch (MalformedURLException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+
+ public static void main(String[] args) {
+
+ BaiduPictureDownloadProcesser downloadPicture = new BaiduPictureDownloadProcesser();
+ for (Map.Entry<String, String> entry : KEY_WORDS.entrySet()) {
+ List<String> urlList = new CopyOnWriteArrayList<>();
+ for (int i = 0; i < 2; i++) { //控制爬取页数,一页10张图片
+ String url = "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&queryWord=" + entry.getKey() + "&word=" + entry.getKey() + "&pn=" + i * 10 + "0";
+ Spider.create(new BaiduPictureDownloadProcesser())
+ .addUrl(url)
+ .run();
+ urlList.addAll(urls);
+ }
+ downloadPicture.downloadPicture(urlList, entry.getKey(), entry.getValue());
+ }
+
+ }
+
+ static List<String> urls;
+ static List<String> names;
+
+ public void setUrls(List<String> urls) {
+ BaiduPictureDownloadProcesser.urls = urls;
+ }
+
+ public void setNames(List<String> names) {
+ BaiduPictureDownloadProcesser.names = names;
+ }
+
+ /**
+ * 创建文件夹
+ *
+ * @param dir 当前文件夹
+ */
+ private void createDir(String dir) throws IOException {
+ Path path = Paths.get(dir);
+ if (!Files.exists(path)) {
+ Files.createDirectories(path);
+ }
+
+ }
+}
diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java
index 7c6192692..68a6fedc6 100644
--- a/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java
+++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java
@@ -21,7 +21,8 @@ public void test() {
Page page = new Page();
page.setRequest(new Request("http://my.oschina.net/flashsword/blog"));
page.setUrl(new PlainText("http://my.oschina.net/flashsword/blog"));
- page.setHtml(new Html(html));
+ //page.setHtml(new Html(html));
+ page.setHtml(new Html(""));
long time = System.currentTimeMillis();
for (int i = 0; i < 1000; i++) {
modelPageProcessor.process(page);
@@ -34,6 +35,7 @@ public void test() {
System.out.println(System.currentTimeMillis() - time);
}
+/*
private String html = "\n" +
"\n" +
"\n" +
@@ -888,4 +890,5 @@ public void test() {
"\n" +
"\n" +
"";
+*/
}