• jsoup爬虫,项目实战,欢迎收看


    import com.mongodb.BasicDBObject
    import com.mongodb.DBCollection
    import org.jsoup.Jsoup
    import org.jsoup.nodes.Document
    import org.jsoup.nodes.Element
    import org.jsoup.select.Elements
    
    public class ZhongYuan {
        public static final DBCollection test = MongoUtils.getCollectionByName("name", "table", 
    "port")
        public static final DBCollection html = MongoUtils.getCollectionByName("name", "table", 
    "port")
    
        public static void main(String[] args){
    //        循环遍历页面进行数据爬去
            for(int i = 500 ; i<598 ;i++) {
                String url = "http://sh.centanet.com/xiaoqu/g"+i+"/";
                String result = RequestUtil.doGet(url, "GBK");
                Document doc = Jsoup.parse(result);
                //页面加载完成后对document进行处理,获取自己有用的数据
                parseList(doc);
                System.out.println("page=====>"+i);
            }
        }
        private static void parseList(Document doc){
    
            Elements elements = doc.select("div.house-listBox>div");
            int j = 0;
            for(Element element : elements){
    
                String name = element.select(".house-title a").first().text();
                html.save(new BasicDBObject("name",name).append("html",element.toString()))
                String regionstr = element.select("div>div>p").first().text().replace(' ','-');
                String region = regionstr.split("-")[0];
                String address = null;
                if(regionstr.split("-").length>1) {
                    address = regionstr.split("-")[1] + regionstr.split("-")[2];
                } else {
                    address = regionstr.split("-")[1];
                }
    
                String price = element.select("div>div").last().select("p").first().text();
                test.insert(new BasicDBObject("city","上海").append("region",region).append("name",name)
                        .append("avg_price",price));
                System.out.println(name);
                j++;
            }
            System.out.println(j);
        }
        private static void parseList1(Document doc) {
            Elements elements = doc.select("div.section>ul>li");
            String name = null;
            String region = null;
            String price = null;
            for (Element element : elements) {
                if (element.toString().contains("room-img")) {
                    name = element.select("h5.room-name a").first().text();
                    Elements datas = element.select("p");
                    int i = 0;
                    for (Element data : datas) {
                        i++;
                        if (i == 2) {
                            price = data.text();
                        }
                        if (i == 4) {
                            region = data.text();
                        }
                    }
                    System.out.println(name + price + region);
                    test.insert(new BasicDBObject("city","上海").append("region",region).append("name",name)
                            .append("avg_price",price));
                }
            }
        }
    }

    相关的 GET 请求我自己封装了一个工具类 RequestUtil,代码见下方,可以参考。上面的这一句 String result = RequestUtil.doGet(url, "GBK"); 调用的就是这个工具类;这里也可以直接使用 jsoup 自带的请求方法(例如 Jsoup.connect(url).get())。

    /**
     * Sends a GET request with no extra headers, the UTF-8 charset and URL encoding
     * switched off.
     *
     * @param url address to request
     * @return the result of the four-argument {@code doGet} overload for these settings
     */
        public static String doGet(String url) {
            final boolean encodeUrl = false;
            return doGet(url, null, "UTF-8", encodeUrl);
        }
    
        /**
         * Sends a GET request with no extra headers and the UTF-8 charset.
         *
         * @param url       address to request
         * @param encodeUrl forwarded unchanged to the four-argument {@code doGet} overload
         * @return the result of the four-argument {@code doGet} overload for these settings
         */
        public static String doGet(String url, boolean encodeUrl) {
            final String defaultCharset = "UTF-8";
            return doGet(url, null, defaultCharset, encodeUrl);
        }
    
        /**
         * Sends a GET request with no extra headers, the given charset and URL encoding
         * switched on.
         *
         * @param url     address to request
         * @param charset forwarded unchanged to the four-argument {@code doGet} overload
         * @return the result of the four-argument {@code doGet} overload for these settings
         */
        public static String doGet(String url, String charset) {
            final boolean encodeUrl = true;
            return doGet(url, null, charset, encodeUrl);
        }
    
        /**
         * Sends a GET request with the given headers, the UTF-8 charset and URL encoding
         * switched on.
         *
         * @param url     address to request
         * @param headers forwarded unchanged to the four-argument {@code doGet} overload
         * @return the result of the four-argument {@code doGet} overload for these settings
         */
        public static String doGet(String url, Map<String, String> headers) {
            final String defaultCharset = "UTF-8";
            final boolean encodeUrl = true;
            return doGet(url, headers, defaultCharset, encodeUrl);
        }
    
        /**
         * Sends an HTTP GET request and returns the response body as text.
         *
         * <p>A new client is built for every call and closed before the method returns.
         * No request timeout is configured here.
         *
         * @param url       address to request
         * @param headers   extra request headers to add, or {@code null} for none
         * @param charset   charset used to decode the response body; also handed to
         *                  {@code encodingUrl} when {@code encodeUrl} is set
         * @param encodeUrl whether {@code url} is passed through {@code encodingUrl} before
         *                  the request is built
         * @return the decoded body when the status code is 200 and the response carries an
         *         entity; {@code null} otherwise, including when an {@link IOException}
         *         occurs (its stack trace is printed)
         */
        public static String doGet(final String url, Map<String, String> headers, String charset, boolean encodeUrl) {
            String result = null;
            // try-with-resources closes the response and then the client on every path, and a
            // failure while closing is reported by the handler below instead of being dropped.
            try (CloseableHttpClient client = HttpClients
                    .custom()
                    .setUserAgent(USERAGENT_CHROME)
                    .build()) {
                String requestUrl = encodeUrl ? encodingUrl(url, charset) : url;
                HttpGet httpGet = new HttpGet(requestUrl);
                if (headers != null) {
                    for (Map.Entry<String, String> header : headers.entrySet()) {
                        httpGet.addHeader(header.getKey(), header.getValue());
                    }
                }
                try (CloseableHttpResponse response = client.execute(httpGet)) {
                    int statusCode = response.getStatusLine().getStatusCode();
                    // A response without an entity has no body to decode.
                    if (statusCode == 200 && response.getEntity() != null) {
                        result = EntityUtils.toString(response.getEntity(), charset);
                    }
                }
            } catch (IOException e) {
                // ClientProtocolException is an IOException, so one handler covers both.
                e.printStackTrace();
            }
            return result;
        }
  • 相关阅读:
    Linux 常用命令
    Oracle DG 三种模式(转)
    S5PV2210
    Timer wheel etc.
    SCM etc.
    负载均衡 IO etc.
    Remoting,OData Snippet Compiler等
    displaytag 动态列实现
    <display:column>属性解释
    <display:table>属性解释
  • 原文地址:https://www.cnblogs.com/asd529735325/p/10216040.html
Copyright © 2020-2023  润新知