-
Notifications
You must be signed in to change notification settings - Fork 0
/
WebCrawler.java
139 lines (117 loc) · 3.91 KB
/
WebCrawler.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
package searchengine;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * A web crawler that runs a Google search for a keyword (identifying
 * itself with the Googlebot user agent, parsing with jsoup) and collects
 * the domain names of the result links on the results page.
 *
 * @author Mike Wu
 */
public class WebCrawler {
    /** Google search URL built from the keyword. */
    private final String url;
    /** The keyword this crawler searches for. */
    private final String keyword;
    /** Domains collected from the result links (deduplicated by the set). */
    private final Set<String> urls = new HashSet<String>();
    /** Present as Googlebot so Google serves a plain-HTML results page. */
    private static final String USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)";
    /** Last page fetched by {@link #crawl(String)}; searched by {@link #searchForWord(String)}. */
    private Document htmlDocument;
    /** Matches a bare domain name such as "www.example.com". */
    private static final String DOMAIN_NAME_PATTERN = "([a-zA-Z0-9]([a-zA-Z0-9\\-]{0,61}[a-zA-Z0-9])?\\.)+[a-zA-Z]{2,6}";
    /** Compiled once; Pattern is thread-safe and expensive to build. */
    private static final Pattern patternDomainName = Pattern.compile(DOMAIN_NAME_PATTERN);

    /**
     * Creates a crawler for the given keyword and builds the Google
     * search URL (requesting up to 80 results).
     *
     * @param aKeyword the search keyword
     */
    WebCrawler(String aKeyword) {
        keyword = aKeyword;
        url = "https://google.com/search?q=" + aKeyword + "&num=80";
    }

    /**
     * Fetches the search-results page, then reports whether the keyword
     * appears in its body text and how many pages were collected.
     */
    public void search() {
        String currentUrl = url;
        crawl(currentUrl);
        boolean success = searchForWord(keyword);
        if (success) {
            System.out.println(String.format("**Success** Word %s found at %s", keyword, currentUrl));
        }
        System.out.println(String.format("**Done** Visited %s web page(s)", urls.size()));
    }

    /**
     * Extracts the domain name from a Google result link such as
     * {@code /url?q=https://www.example.com/page&sa=U...}.
     *
     * <p>Fix: the previous implementation returned hard-coded substrings
     * (indices 14/15 to 42), which threw StringIndexOutOfBoundsException
     * for links shorter than 42 characters and produced truncated garbage
     * for longer ones, while the compiled domain-name pattern was dead
     * code. This version actually applies the pattern.
     *
     * @param url a result link, typically starting with "/url?q="
     * @return the first domain name matched in the link; if no domain
     *         matches, the link stripped of its "/url?q=" prefix and any
     *         trailing "&sa=" tracking parameters
     */
    public String getDomainName(String url) {
        // Use a local matcher instead of the old mutable instance field.
        Matcher matcher = patternDomainName.matcher(url);
        if (matcher.find()) {
            return matcher.group();
        }
        // Fallback: return the raw redirect target without prefix/tracking.
        String target = url.startsWith("/url?q=") ? url.substring(7) : url;
        int sa = target.indexOf("&sa=");
        return sa >= 0 ? target.substring(0, sa) : target;
    }

    /**
     * @return the set of domains collected so far (live view, not a copy)
     */
    public Set<String> getUrls() {
        return this.urls;
    }

    /**
     * Makes an HTTP request for the given URL, stores the parsed document
     * for {@link #searchForWord(String)}, and collects the domain of every
     * Google result link ("/url?q=http...") on the page.
     *
     * @param url the page to fetch
     * @return true if an HTML page was retrieved and scanned; false on a
     *         non-HTML response or an I/O failure
     */
    public boolean crawl(String url) {
        try {
            Connection connection = Jsoup.connect(url).userAgent(USER_AGENT);
            final Document document = connection.timeout(5000).get();
            this.htmlDocument = document;
            if (connection.response().statusCode() == 200) {
                System.out.println("\n**Visiting** Received web page at " + url);
            }
            // contentType() may be null when the server omits the header;
            // the old code would NPE here.
            String contentType = connection.response().contentType();
            if (contentType == null || !contentType.contains("text/html")) {
                System.out.println("**Failure** Retrieved something other than HTML");
                return false;
            }
            Elements linksOnPage = document.select("a[href]");
            System.out.println("Found (" + linksOnPage.size() + ") links");
            for (Element link : linksOnPage) {
                String href = link.attr("href");
                // Google wraps organic results as "/url?q=<target>&sa=...".
                if (href.startsWith("/url?q=http")) {
                    this.urls.add(getDomainName(href));
                }
            }
            return true;
        } catch (IOException ioe) {
            // Report the failure instead of swallowing it silently.
            System.out.println("**Failure** HTTP request failed: " + ioe.getMessage());
            return false;
        }
    }

    /**
     * Checks whether the most recently crawled page contains the word
     * (case-insensitive match against the page's body text).
     *
     * @param searchWord the word to look for
     * @return true if the word occurs in the page body; false if it does
     *         not or if no page has been crawled successfully
     */
    public boolean searchForWord(String searchWord) {
        // body() can also be null for documents without a <body> element.
        if (this.htmlDocument == null || this.htmlDocument.body() == null) {
            System.out.println("Error!");
            return false;
        }
        System.out.println("Searching for the word " + searchWord + "...");
        String bodyText = this.htmlDocument.body().text();
        return bodyText.toLowerCase().contains(searchWord.toLowerCase());
    }

    /** Demo entry point: search for "computer" and print collected domains. */
    public static void main(String[] args) {
        WebCrawler wc = new WebCrawler("computer");
        wc.search();
        for (String s : wc.getUrls()) {
            System.out.println(s);
        }
    }
} // end WebCrawler