/**
* @(#)SearchCrawler.java, 2016年4月12日. Copyright 2016 Youdao, Inc. All rights
* reserved. YOUDAO PROPRIETARY/CONFIDENTIAL. Use is
* subject to license terms.
*/
package testZK;
import java.util.*;
import java.net.*;
import java.io.*;
/**
 * Command-line tool that checks URLs against the {@code robots.txt} of their
 * host and appends one {@code url:true} or {@code url:false} line per checked
 * URL to {@code result.txt} in the working directory.
 *
 * <p>Only {@code Disallow:} lines are interpreted. {@code User-agent} sections
 * and {@code Allow:} lines are not evaluated, so every Disallow rule found in
 * the file is applied regardless of the agent it was written for.
 *
 * <p>The result writer is a single static instance shared by all crawlers; the
 * class does no synchronization of its own around it.
 *
 * @author zhoukang
 */
public class SearchCrawler extends Thread {
    /** The only URL prefix accepted by {@link #verifyUrl(String)}. */
    private static final String HTTP_PREFIX = "http://";

    /** robots.txt field name of an exclusion rule; matched case-insensitively. */
    private static final String DISALLOW_FIELD = "Disallow:";

    private static File resultFile = new File("result.txt");

    /** Shared result writer; stays {@code null} when the result file could not be opened. */
    private static BufferedWriter writer;

    static {
        try {
            // FileWriter creates the file when it is missing and truncates it otherwise.
            writer = new BufferedWriter(new FileWriter(resultFile));
        } catch (Exception e) {
            // A static initializer must not throw; report the failure and leave
            // writer null so that callers can detect the condition.
            System.err.println("Cannot open " + resultFile.getAbsolutePath()
                    + " for writing: " + e);
        }
    }

    /** Disallow rules already downloaded, keyed by lower-cased host (plus ":port" if explicit). */
    private final Map<String, List<String>> disallowListCache =
            new HashMap<String, List<String>>();

    /** URLs this crawler checks when it runs. */
    private final List<String> urlList;

    /**
     * Creates a crawler for a single URL or for a file listing one URL per line.
     *
     * @param str  the URL to check, or the path of the URL list when {@code file} is true
     * @param file {@code true} to treat {@code str} as a file name
     * @throws IOException if the URL list cannot be opened or read
     */
    public SearchCrawler(String str, boolean file) throws IOException {
        urlList = new ArrayList<String>();
        if (!file) {
            urlList.add(str);
            return;
        }
        BufferedReader reader = new BufferedReader(new FileReader(str));
        try {
            String line;
            // Test the line before storing it so the end-of-file null is never added.
            while ((line = reader.readLine()) != null) {
                String candidate = line.trim();
                if (candidate.length() > 0) {
                    urlList.add(candidate);
                }
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Returns the shared result writer.
     *
     * @return the writer, or {@code null} if the result file could not be opened
     */
    public BufferedWriter getBufferedWriter() {
        return writer;
    }

    /** Checks every URL this crawler was constructed with. */
    @Override
    public void run() {
        checkUrl(urlList);
    }

    /**
     * Checks each URL and writes {@code url:true} or {@code url:false} to the
     * result file. Null or empty entries are ignored; URLs rejected by
     * {@link #verifyUrl(String)} are reported on standard error and skipped.
     *
     * @param urls the URLs to check
     */
    private void checkUrl(List<String> urls) {
        if (writer == null) {
            System.err.println("Result file is not available; no URL was checked.");
            return;
        }
        for (String rawUrl : urls) {
            if (rawUrl == null || rawUrl.equals("")) {
                continue;
            }
            String url = removeWwwFromUrl(rawUrl);
            System.out.println(url);
            URL verifiedUrl = verifyUrl(url);
            if (verifiedUrl == null) {
                System.err.println("Skipping unsupported or malformed URL: " + url);
                continue;
            }
            try {
                writer.write(url + ":" + isRobotAllowed(verifiedUrl));
                writer.newLine();
                // Flush per URL so partial results survive an abrupt termination.
                writer.flush();
            } catch (IOException e) {
                System.err.println("Failed to record result for " + url + ": " + e);
            }
        }
    }

    /**
     * Parses a URL string, accepting only {@code http://} URLs that name a host.
     *
     * @param url the text to parse
     * @return the parsed URL, or {@code null} if it is not an acceptable HTTP URL
     */
    private URL verifyUrl(String url) {
        if (!url.regionMatches(true, 0, HTTP_PREFIX, 0, HTTP_PREFIX.length())) {
            return null;
        }
        try {
            URL parsed = new URL(url);
            // Without a host there is no robots.txt location to consult.
            return parsed.getHost().length() == 0 ? null : parsed;
        } catch (MalformedURLException e) {
            return null;
        }
    }

    /**
     * Tells whether the robots.txt of the URL's host permits the URL. The rules
     * of each host are downloaded once and cached. If robots.txt cannot be
     * retrieved the URL counts as allowed, and nothing is cached so that a later
     * URL on the same host triggers another attempt.
     *
     * @param urlToCheck the URL to test; must not be {@code null}
     * @return {@code false} if a Disallow rule is a prefix of the URL's path
     */
    private boolean isRobotAllowed(URL urlToCheck) {
        String host = urlToCheck.getHost().toLowerCase(Locale.ENGLISH);
        int port = urlToCheck.getPort();
        // robots.txt applies per host and port, so both take part in the cache key.
        String cacheKey = (port == -1) ? host : host + ":" + port;
        List<String> disallowList = disallowListCache.get(cacheKey);
        if (disallowList == null) {
            try {
                disallowList = fetchDisallowList(new URL("http", host, port, "/robots.txt"));
            } catch (IOException e) {
                return true;
            }
            disallowListCache.put(cacheKey, disallowList);
        }
        return isPathAllowed(urlToCheck.getFile(), disallowList);
    }

    /** Downloads a robots.txt file and returns its Disallow rules, closing the stream afterwards. */
    private static List<String> fetchDisallowList(URL robotsFileUrl) throws IOException {
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(robotsFileUrl.openStream(), "UTF-8"));
        try {
            return parseDisallowList(reader);
        } finally {
            reader.close();
        }
    }

    /** Extracts the non-empty path prefixes of all {@code Disallow:} lines read from {@code reader}. */
    private static List<String> parseDisallowList(BufferedReader reader) throws IOException {
        List<String> rules = new ArrayList<String>();
        String line;
        while ((line = reader.readLine()) != null) {
            String trimmed = line.trim();
            if (!trimmed.regionMatches(true, 0, DISALLOW_FIELD, 0, DISALLOW_FIELD.length())) {
                continue;
            }
            String path = trimmed.substring(DISALLOW_FIELD.length());
            int commentIndex = path.indexOf('#');
            if (commentIndex != -1) {
                path = path.substring(0, commentIndex);
            }
            path = path.trim();
            // An empty Disallow value excludes nothing; storing "" would make
            // startsWith match every path and block the whole host.
            if (path.length() > 0) {
                rules.add(path);
            }
        }
        return rules;
    }

    /** Returns {@code true} unless one of the Disallow prefixes matches the start of {@code file}. */
    private static boolean isPathAllowed(String file, List<String> disallowList) {
        // A URL without a path refers to the root, so compare it as "/".
        String path = file.startsWith("/") ? file : "/" + file;
        for (String rule : disallowList) {
            if (path.startsWith(rule)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Removes a leading {@code www.} from the host part of a URL.
     *
     * @param url the URL text
     * @return the URL without the first {@code ://www.} occurrence's {@code www.}
     */
    private String removeWwwFromUrl(String url) {
        int index = url.indexOf("://www.");
        if (index != -1) {
            // Keep everything up to and including "://", then skip the four chars of "www.".
            return url.substring(0, index + 3) + url.substring(index + 7);
        }
        return url;
    }

    /**
     * Registers a JVM shutdown hook that closes the result writer.
     *
     * @param searchCrawler the crawler whose writer is closed at shutdown
     */
    private static void addShutDownHook(final SearchCrawler searchCrawler) {
        Runtime.getRuntime().addShutdownHook(new Thread() {
            @Override
            public void run() {
                BufferedWriter out = searchCrawler.getBufferedWriter();
                if (out == null) {
                    return;
                }
                try {
                    out.close();
                } catch (IOException e) {
                    System.err.println("Failed to close result file: " + e);
                }
            }
        });
    }

    /**
     * Entry point. Accepts either a single URL, or {@code -f} followed by the
     * name of a file containing one URL per line.
     *
     * @param args the command-line arguments
     * @throws InterruptedException if interrupted while waiting for the crawler
     * @throws IOException declared for compatibility; read failures are reported on standard error
     */
    public static void main(String[] args) throws InterruptedException, IOException {
        boolean fromFile = args.length == 2 && "-f".equals(args[0]);
        boolean singleUrl = args.length == 1 && !"-f".equals(args[0]);
        if (!fromFile && !singleUrl) {
            System.out
                    .println("Usage-1:java SearchCrawler url");
            System.out
                    .println("Usage-2:java SearchCrawler -f filename");
            return;
        }
        SearchCrawler crawler;
        try {
            crawler = fromFile
                    ? new SearchCrawler(args[1], true)
                    : new SearchCrawler(args[0], false);
        } catch (IOException e) {
            System.err.println("Cannot read URL list: " + e.getMessage());
            return;
        }
        addShutDownHook(crawler);
        crawler.setDaemon(true);
        crawler.start();
        crawler.join();
    }
}