• JAVA实现网页抓取(htmlunit)


    准备条件

    加入依赖jar包

    <dependency>
         <groupId>net.sourceforge.htmlunit</groupId>
          <artifactId>htmlunit</artifactId>
          <version>2.15</version>
    </dependency>


    代码示例

    private WebClient initWc() throws IOException {
        WebClient wc = new WebClient(BrowserVersion.CHROME);
        wc.getOptions().setJavaScriptEnabled(false);
        wc.getOptions().setCssEnabled(false);
        wc.getOptions().setTimeout(8000);
        wc.setJavaScriptTimeout(8000);
        wc.setAjaxController(new NicelyResynchronizingAjaxController());
        wc.waitForBackgroundJavaScript(8000);
    //        Cache cache=new Cache();
    //        wc.setCache(cache);
        wc.getOptions().setThrowExceptionOnScriptError(false);
    //        wc.getOptions().setThrowExceptionOnFailingStatusCode(false);
        return wc;
    }

    public void loadData() {
      WebClient wc = null;

        if ( wc == null ) {
            try {
                wc = initWc();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        try {
            //图片中文字解析时使用
            IIORegistry registry = IIORegistry.getDefaultInstance();  
            registry.registerServiceProvider(new com.sun.media.imageioimpl.plugins.tiff.TIFFImageWriterSpi());  
            registry.registerServiceProvider(new com.sun.media.imageioimpl.plugins.tiff.TIFFImageReaderSpi());  

            StringBuffer errPage =new StringBuffer();
            for(int i =1 ; i<=97;i++){
                loadPage(i,errPage,wc);
                riskCompanyDao.flush();
            }
            log.info("errPage:"+errPage);
    //            loadPage(27,errPage,wc);
        } catch (Exception e) {
            log.warn("loadData error! ", e);
        } finally {
            wc.closeAllWindows();
        }
    }

    private void loadPage(int pageNo,StringBuffer errPage, WebClient wc){
       HtmlPage page;
        try {
            String refer="http://www.baidu.com/";
            URL link=new URL("http://www.kstba.org/minglu-79-"+pageNo+".html");
            WebRequest request=new WebRequest(link);
            request.setCharset("UTF-8");
            request.setAdditionalHeader("Referer", refer);//设置请求报文头里的refer字段
            设置请求报文头里的User-Agent字段
            request.setAdditionalHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36");
            request.setAdditionalHeader("Connection", "keep-alive");
            request.setAdditionalHeader("Cookie", "ad_play_index=47; CNZZDATA1000215585=2014872656-1449554771-%7C1449572770");

            page = wc.getPage(request);

            HtmlPage pageResult = page;
            HtmlTable tableResult = (HtmlTable) pageResult.getElementsByTagName("table").get(0);
            HtmlTableBody body = (HtmlTableBody) tableResult.getChildNodes().get(1);
            int indexRow = 0;
            for ( DomNode node2 : body.getChildNodes() ) {

                if (node2 instanceof  HtmlTableRow ) {
                    HtmlTableRow row = (HtmlTableRow) node2;
                    List<HtmlTableCell> cells = row.getCells();
                    HtmlTableCell cell0=cells.get(0);
                    String companyName = cell0.getElementsByTagName("a").get(0).getTextContent();
                    String industryName = cell0.getElementsByTagName("div").get(0).getTextContent();
                    industryName = industryName.split(":")[1];
                    String addr = cell0.getElementsByTagName("div").get(1).getTextContent();
                    if (addr.split(":").length>1){
                        addr = addr.split(":")[1];
                    }else{
                        addr=null;
                    }
                    String mobile =null;
                    if (cell0.getElementsByTagName("div").get(2).getElementsByTagName("img").size()>0){
                        HtmlImage img =(HtmlImage)cell0.getElementsByTagName("div").get(2).getElementsByTagName("img").get(0);
                        String imgStr =img.getAttribute("src");
                        imgStr =imgStr.substring(0,imgStr.indexOf("&font=")).replace("fontsize=12", "fontsize=22");
                        mobile = ImageRead.getImgStr(imgStr);
                        log.info("mobile:"+mobile);
                    }
                   
                }
                indexRow++;
            }

        } catch (Exception e) {
            errPage.append(pageNo).append(",");
            log.warn("page error :"+pageNo,e);
        }

    }


    注意事项

        普通的httpConnection容易被拦截,需设置请求报文头,模拟浏览器请求
        WebClient在请求发起前初始化一次即可
        不同浏览器版本返回的html代码有一定差异,需单独调试

  • 相关阅读:
    linux杂记
    mysql 备份命令
    查看mysql 套接字文件 mysql.sock
    处理下载是文件名乱码正确姿势
    对象创建完成后进行数据同步
    python——数据库编程
    python——网络编程2
    python——网络编程
    python——12、面向对象
    pythoning——11、正则匹配
  • 原文地址:https://www.cnblogs.com/cuihongyu3503319/p/15047161.html
Copyright © 2020-2023  润新知