Java 中使用 HtmlUnit 编写爬虫

2019-04-13 21:32发布

// Step 1: add the HtmlUnit dependency to pom.xml:
//
//   <dependency>
//       <groupId>net.sourceforge.htmlunit</groupId>
//       <artifactId>htmlunit</artifactId>
//       <version>2.27</version>
//   </dependency>

// Step 2: write a utility method that fetches a page.

/**
 * Configures the given client for script-heavy pages, loads {@code url}, and waits for
 * background JavaScript before returning the page.
 *
 * <p>The client is closed here only when loading or waiting fails; on success it stays open and
 * the caller remains responsible for closing it.
 *
 * <p>{@code timeout} and {@code waitForBackgroundJavaScript} are not declared in this snippet;
 * they are expected to be fields of the enclosing utility class (NOTE(review): confirm their
 * declarations and units against the HtmlUnit API).
 *
 * @param webClient the client to configure and use for the request
 * @param url the address of the page to load
 * @return the loaded page
 * @throws Exception whatever the page load or the background-JavaScript wait throws
 */
public HtmlPage getHtmlPageResponse(WebClient webClient, String url) throws Exception {
    // Do not throw when a script on the page fails.
    webClient.getOptions().setThrowExceptionOnScriptError(false);
    // Do not throw when the server answers with a failing HTTP status code.
    webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
    webClient.getOptions().setActiveXNative(false);
    // Enable CSS processing.
    webClient.getOptions().setCssEnabled(true);
    // Important: enable JavaScript.
    webClient.getOptions().setJavaScriptEnabled(true);
    webClient.getOptions().setRedirectEnabled(true);
    // Important: enable AJAX support.
    webClient.setAjaxController(new NicelyResynchronizingAjaxController());
    webClient.getCookieManager().setCookiesEnabled(true);
    // Request timeout of the simulated "browser".
    webClient.getOptions().setTimeout(timeout);
    // Timeout for JavaScript execution.
    webClient.setJavaScriptTimeout(timeout);

    try {
        HtmlPage page = webClient.getPage(url);
        // Blocks the calling thread. Kept inside the try so that a failure while waiting
        // releases the client exactly like a failed page load does.
        webClient.waitForBackgroundJavaScript(waitForBackgroundJavaScript);
        return page;
    } catch (Exception e) {
        webClient.close();
        throw e;
    }
}

// Step 3: call the utility method.

// try-with-resources closes the client once the page has been processed, so the windows and
// connections it holds are not leaked on the success path.
try (WebClient webClient = new WebClient()) {
    HtmlPage htmlPage = httpUtils.getHtmlPageResponse(webClient, infoSource.getSourceUrl());
    // Parse the page's XML serialization into a Jsoup document.
    Document document = Jsoup.parse(htmlPage.asXml());
    // Process the document to extract the required content.
}