HtmlUnit WebClient 超时答案

【问题标题】：HtmlUnit WebClient Timeout（HtmlUnit WebClient 超时）
【发布时间】：2013-01-11 15:45:12
【问题描述】：

在我之前关于 HtmlUnit 的两个问题《Skip particular Javascript execution in HTML unit》和《Fetch Page source using HtmlUnit : URL got stuck》中，

我曾提到 URL 卡住了。我还发现，卡住的原因是 HtmlUnit 库中的一个方法（parse）没有退出执行。

我在这方面做了进一步的工作：我编写了代码，使该方法在执行时间超过指定的超时秒数时退出。

import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * Demonstrates bounding an HtmlUnit page fetch with a wall-clock timeout by running the
 * fetch on a single-thread executor and waiting on the resulting {@link Future}.
 */
public class HandleHtmlUnitTimeout {

    /** URL fetched by {@link #main} and used when {@link #getPageSource} receives {@code null}. */
    private static final String DEFAULT_URL = "http://ericaweiner.com/collections/";

    /**
     * Fetches the default URL with a 60 second limit.
     *
     * @param args unused
     * @throws TimeoutException if the fetch fails or does not finish within the limit
     */
    public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException, InterruptedException, TimeoutException {
        doWorkWithTimeout(DEFAULT_URL, 60);
    }

    /**
     * Runs {@link #getPageSource(String)} on a worker thread and waits at most
     * {@code timeoutSecs} seconds for it to finish.
     *
     * @param url         the page to fetch
     * @param timeoutSecs maximum number of seconds to wait for the fetch
     * @throws InterruptedException declared for callers; failures are reported as
     *                              {@link TimeoutException}
     * @throws TimeoutException     if the wait times out, the worker fails, or the waiting
     *                              thread is interrupted; the original exception is attached
     *                              as the cause
     */
    public static void doWorkWithTimeout(final String url, long timeoutSecs) throws InterruptedException, TimeoutException {
        // A single worker thread runs the fetch so the caller can bound the wait.
        ExecutorService executor = Executors.newFixedThreadPool(1);

        final Future<?> future = executor.submit(new Runnable() {
            @Override
            public void run() {
                try {
                    getPageSource(url);
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
            }
        });

        try {
            future.get(timeoutSecs, TimeUnit.SECONDS);
        } catch (Exception e) {
            // ExecutionException: the worker threw
            // TimeoutException: the worker did not finish within timeoutSecs
            // InterruptedException: the waiting thread was interrupted

            // cancel(true) only requests an interrupt of the worker; it does not
            // guarantee that the worker actually stops.
            future.cancel(true);

            if (e instanceof InterruptedException) {
                // Restore the interrupt flag so callers further up can still observe it.
                Thread.currentThread().interrupt();
            }

            // Keep the exception type callers already handle, but preserve the real cause.
            TimeoutException failure = new TimeoutException("Fetching " + url + " failed or timed out after " + timeoutSecs + " seconds");
            failure.initCause(e);
            throw failure;
        } finally {
            executor.shutdownNow();
        }
    }

    /**
     * Loads the given page with JavaScript enabled, waits for background JavaScript, and
     * closes the client's windows. HTTP status, URL and I/O failures are printed to standard
     * error and not rethrown.
     *
     * @param productPageUrl the page to load; {@code null} selects the default URL
     */
    public static void getPageSource(String productPageUrl) {
        final String targetUrl = (productPageUrl == null) ? DEFAULT_URL : productPageUrl;

        final WebClient wb = new WebClient(BrowserVersion.FIREFOX_3_6);
        try {
            wb.getOptions().setTimeout(120000);
            wb.getOptions().setJavaScriptEnabled(true);
            wb.getOptions().setThrowExceptionOnScriptError(true);
            wb.getOptions().setThrowExceptionOnFailingStatusCode(false);
            wb.getPage(targetUrl);
            wb.waitForBackgroundJavaScript(4000);
        } catch (FailingHttpStatusCodeException e) {
            e.printStackTrace();
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close the windows on every path, including when loading the page throws.
            wb.closeAllWindows();
        }
    }

}

这段代码确实会从 doWorkWithTimeout(url, 60); 方法中返回，但程序并没有终止。

当我尝试使用以下代码调用类似的实现时：

import java.util.Date;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

import org.apache.log4j.Logger;


/**
 * Control example: runs a simple sleeping task on a single-thread executor and waits for
 * it with a timeout, cancelling the task if the wait fails.
 */
public class HandleScraperTimeOut {

    private static final Logger logger = Logger.getLogger(HandleScraperTimeOut.class);

    /**
     * Stand-in for a long-running task: logs, sleeps for 20 seconds, then logs again.
     *
     * @throws InterruptedException if the thread is interrupted while sleeping
     */
    public void doWork() throws InterruptedException {
        logger.info(new Date() + "Starting worker method ");
        Thread.sleep(20000);
        logger.info(new Date() + "Ending worker method ");
    }

    /**
     * Runs {@link #doWork()} on a worker thread and waits at most {@code timeoutSecs}
     * seconds for it. Any failure is logged as a warning and not rethrown.
     *
     * @param timeoutSecs maximum number of seconds to wait for the task
     */
    public void doWorkWithTimeout(int timeoutSecs) {
        // A single worker thread runs doWork so the caller can bound the wait.
        ExecutorService executor = Executors.newFixedThreadPool(1);
        logger.info("Starting method with "+timeoutSecs+" seconds as timeout");

        final Future<?> future = executor.submit(new Runnable() {
            @Override
            public void run() {
                try {
                    doWork();
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
            }
        });

        try {
            future.get(timeoutSecs, TimeUnit.SECONDS);
        } catch (Exception e) {
            // ExecutionException: the worker threw
            // TimeoutException: the worker did not finish within timeoutSecs
            // InterruptedException: the waiting thread was interrupted

            // Requests an interrupt of the worker if it is still running.
            future.cancel(true);

            if (e instanceof InterruptedException) {
                // Restore the interrupt flag instead of silently swallowing it.
                Thread.currentThread().interrupt();
            }

            logger.warn("encountered problem while doing some work", e);
        } finally {
            // Release the pool thread on every path.
            executor.shutdown();
        }
    }

    /**
     * Runs the sleeping task with a 30 second limit.
     *
     * @param a unused
     */
    public static void main(String[] a) {
        HandleScraperTimeOut hcto = new HandleScraperTimeOut();
        hcto.doWorkWithTimeout(30);
    }

}

如果有人可以看看并告诉我问题是什么，那将非常有帮助。

有关问题的更多详细信息，您可以查看Skip particular Javascript execution in HTML unit 和 Fetch Page source using HtmlUnit : URL got stuck

更新 1 奇怪的是：future.cancel(true);在这两种情况下都返回 TRUE。我的预期是：

对于 HtmlUnit，它应该返回 FALSE，因为进程仍处于挂起状态。
正常的 Thread.sleep();它应该返回 TRUE，因为该过程已成功取消。

更新 2 它只在 http://ericaweiner.com/collections/ 这个 URL 上挂起。如果我提供任何其他 URL，例如 http://www.google.com、http://www.yahoo.com，它不会挂起。在这些情况下，它会抛出 InterruptedException 并退出进程。

似乎http://ericaweiner.com/collections/ 页面源包含某些导致问题的元素。

【问题讨论】：

future.cancel 返回 false 吗？
future.cancel 在这两种情况下都返回 TRUE。据我所知，HtmlUnit 的 WebClient 没有响应，也没有被这种方法取消。
感谢@金田的回复

标签： java multithreading timeout web-scraping htmlunit

【解决方案1】：

Future.cancel(boolean) 返回：

如果任务无法取消，则为 false，通常是因为它已经正常完成
否则为真

Cancelled 表示线程在取消之前尚未完成，取消标志被设置为 true，并且在请求中断的情况下线程会被中断。

中断线程只意味着调用了 Thread.interrupt，仅此而已。Future.cancel(boolean) 不检查线程是否真的停止了。

所以在这种情况下取消返回 true 是正确的。

中断线程意味着它应该尽快停止，但这并不是强制的。您可以尝试通过关闭它所需的资源之类的方式，让它停止或失败。对于从套接字读取（等待传入数据）的线程，我通常就是这么做的：我关闭套接字，让它停止等待。

【讨论】：