-
Notifications
You must be signed in to change notification settings - Fork 4.2k
New issue
Have a question about this project? # for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “#”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? # to your account
更新内容 #944
base: master
Are you sure you want to change the base?
更新内容 #944
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,6 +32,7 @@ | |
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; | ||
import org.apache.http.impl.cookie.BasicClientCookie; | ||
import org.apache.http.protocol.HttpContext; | ||
import org.apache.http.ssl.SSLContexts; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
|
@@ -142,7 +143,15 @@ public void process( | |
connectionManager.setDefaultSocketConfig(socketConfig); | ||
httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true)); | ||
generateCookie(httpClientBuilder, site); | ||
return httpClientBuilder.build(); | ||
SSLContext ctx = null; | ||
try { | ||
ctx = SSLContexts.custom().useProtocol("TLSv1.2").build(); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 为什么要强制使用 TLSv1.2?构造函数里不是已经注册了 SSLConnectionSocketFactory 了吗? |
||
} catch (NoSuchAlgorithmException e) { | ||
logger.warn("CloseableHttpClient getClient #NoSuchAlgorithmException,{}", e); | ||
} catch (KeyManagementException e) { | ||
logger.warn("CloseableHttpClient getClient #NoSuchAlgorithmException ,{} error", e); | ||
} | ||
return httpClientBuilder.create().setSSLContext(ctx).build(); | ||
} | ||
|
||
private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
package us.codecraft.webmagic.pipeline; | ||
|
||
import net.jcip.annotations.ThreadSafe; | ||
import org.apache.commons.codec.digest.DigestUtils; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
@@ -19,7 +20,9 @@ | |
* @author code4crafter@gmail.com <br> | ||
* @since 0.1.0 | ||
*/ | ||
public class FilePipeline extends FilePersistentBase implements Pipeline { | ||
@ThreadSafe | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 这个 annotation 在哪里会用到? |
||
public class FilePipeline extends FilePersistentBase implements Pipeline { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. format 不正确。 |
||
|
||
|
||
private Logger logger = LoggerFactory.getLogger(getClass()); | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
package us.codecraft.webmagic.processor.example; | ||
|
||
|
||
import java.io.*; | ||
import java.net.HttpURLConnection; | ||
import java.net.URL; | ||
|
||
/**
 * Helper for downloading a resource over HTTP and writing it to a local file.
 *
 * Created by wpx on 2018/7/17.
 */
public class DownImgUtil {

    /** Connect timeout applied to every download, in milliseconds (3 seconds). */
    private static final int CONNECT_TIMEOUT_MILLIS = 3 * 1000;

    /** Size of the chunk buffer used while draining an input stream. */
    private static final int BUFFER_SIZE = 1024;

    /**
     * Downloads the resource at the given URL and stores it as a file.
     *
     * The whole response body is read into memory first and only then written
     * to {@code savePath + File.separator + fileName}. The target directory is
     * created (including missing parents) when it does not exist.
     *
     * @param urlStr   URL to download; must use a scheme whose connection is an
     *                 {@link HttpURLConnection}
     * @param fileName name of the file to create inside {@code savePath}
     * @param savePath directory the file is written to
     * @throws IOException if the URL is malformed, the request fails, or the
     *                     file cannot be written
     */
    public static void downLoadFromUrl(String urlStr, String fileName, String savePath) throws IOException {
        // The original enabled the JVM-wide "javax.net.debug=all" switch here on every
        // call; that was debugging residue and is intentionally no longer set.
        URL url = new URL(urlStr);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestMethod("GET");
        conn.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
        // Send a browser-like User-Agent so servers that block crawlers do not answer 403.
        conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");

        // Read the complete body; the stream is closed even when reading fails.
        byte[] data;
        InputStream inputStream = conn.getInputStream();
        try {
            data = readInputStream(inputStream);
        } finally {
            inputStream.close();
        }

        // Make sure the target directory exists. A failure to create it is not
        // reported here; opening the output stream below fails instead.
        // NOTE(review): an empty savePath makes the file path start with the
        // separator (i.e. the filesystem/drive root) - confirm this is intended.
        File saveDir = new File(savePath);
        if (!saveDir.exists()) {
            saveDir.mkdirs();
        }

        // Plain concatenation is kept so the resulting path is the same as before
        // for every fileName value.
        File file = new File(saveDir + File.separator + fileName);
        FileOutputStream fos = new FileOutputStream(file);
        try {
            fos.write(data);
        } finally {
            fos.close();
        }

        System.out.println("info:"+url+" download success");
    }

    /**
     * Reads an input stream to its end and returns everything that was read.
     *
     * The stream is not closed by this method.
     *
     * @param inputStream stream to drain
     * @return all bytes read from the stream; empty when the stream has no data
     * @throws IOException if reading from the stream fails
     */
    public static byte[] readInputStream(InputStream inputStream) throws IOException {
        byte[] buffer = new byte[BUFFER_SIZE];
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        int len;
        while ((len = inputStream.read(buffer)) != -1) {
            bos.write(buffer, 0, len);
        }
        return bos.toByteArray();
    }

    /**
     * Manual smoke test: downloads one hard-coded image into {@code D:\webimg}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            downLoadFromUrl("http://img1.mm131.me/pic/4170/15.jpg", "百度.jpg","D:\\webimg");
        } catch (Exception e) {
            // Report the failure instead of silently printing an empty line.
            System.err.println("download failed: " + e);
        }
    }

}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
package us.codecraft.webmagic.processor.example; | ||
|
||
import org.apache.commons.collections.CollectionUtils; | ||
import us.codecraft.webmagic.Page; | ||
import us.codecraft.webmagic.Site; | ||
import us.codecraft.webmagic.Spider; | ||
import us.codecraft.webmagic.processor.PageProcessor; | ||
|
||
import java.io.IOException; | ||
import java.net.MalformedURLException; | ||
import java.net.URL; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
/** | ||
* 这部分我们直接通过GithubRepoPageProcessor这个例子来介绍PageProcessor的编写方式。 | ||
* 我将PageProcessor的定制分为三个部分,分别是爬虫的配置、页面元素的抽取和链接的发现。 | ||
* | ||
* @author code4crafter@gmail.com <br> | ||
* @since 0.3.2 | ||
*/ | ||
public class ImgPageProcessor implements PageProcessor { | ||
// 部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数等 | ||
|
||
private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000); | ||
|
||
// process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑 | ||
@Override | ||
public void process(Page page) { | ||
// 部分二:定义如何抽取页面信息,并保存下来 | ||
String imgName = page.getHtml().xpath("//div[@class='content-pic']/a/img//@alt").toString(); | ||
String imgUrl = page.getHtml().xpath("//div[@class='content-pic']/a/img//@src").toString(); | ||
String urlpre = ""; | ||
URL url = null; | ||
try { | ||
url = new URL(page.getUrl().toString()); | ||
System.out.println(""); | ||
urlpre = url.getProtocol()+"://" + url.getHost() +"/"+ url.getPath().split("/")[1]; | ||
} catch (MalformedURLException e) { | ||
e.printStackTrace(); | ||
} | ||
if(null!=imgUrl && ""!= imgUrl && ""!=imgName){ | ||
try { | ||
DownImgUtil.downLoadFromUrl(imgUrl, imgName,""); | ||
} catch (IOException e) { | ||
e.printStackTrace(); | ||
} | ||
} | ||
|
||
|
||
// 部分三:从页面发现后续的url地址来抓取 | ||
/** | ||
* page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()用于获取所有满足"(https:/ /github\.com/\w+/\w+)" | ||
* 这个正则表达式的链接,page.addTargetRequests()则将这些链接加入到待抓取的队列中去。 | ||
*/ | ||
List<String> urls = page.getHtml().links().xpath("/html/body/div[6]/div[3]//@href").all(); | ||
List<String> handledUrls = new ArrayList<String>(); | ||
if(CollectionUtils.isNotEmpty(urls)) { | ||
for(String temp : urls) { | ||
handledUrls.add(urlpre+temp); | ||
} | ||
} | ||
page.addTargetRequests(handledUrls); | ||
} | ||
|
||
@Override | ||
public Site getSite() { | ||
return site; | ||
} | ||
|
||
public static void main(String[] args) { | ||
//https://blog.csdn.net/bbc2005/article/details/80890829 | ||
//https://www.cnblogs.com/sunny08/p/8038440.html | ||
// System.setProperty("javax.net.debug", "all"); //打印网络连接握手信息 | ||
Spider.create(new ImgPageProcessor()).addUrl("http://www.mm131.com/xinggan/4170.html").thread(1).run(); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
这个修改是没有必要的。