• Java爬虫-简单解析网页内容


    目标:获取百度新闻首页中所有中国新闻的标题、发布时间和来源。

    获取网页:发送 HTTP GET 请求,并以字符串形式返回页面内容
     2 public static String getContent(String str) throws ClientProtocolException, IOException {
     3         CloseableHttpClient closeableHttpClient=HttpClients.createDefault(); //创建实例
     4         HttpGet httpGet=new HttpGet(str);
     5         CloseableHttpResponse closeableHttpResponse=closeableHttpClient.execute(httpGet); //执行--返回
     6         HttpEntity httpEntity=closeableHttpResponse.getEntity(); //获取实体
     7         String content=EntityUtils.toString(httpEntity, "utf-8");
     8         closeableHttpResponse.close();
     9         closeableHttpClient.close();
    10         return content;
    11     }
    ================================
    筛选所有符合要求的链接:提取页面中的 a 标签,只保留以指定前缀开头的绝对地址
    14     public static ArrayList<String> getUrl(String str,String strr) {
    15         Document doc=Jsoup.parse(str);
    16         Elements elements =doc.select("a[href]"); //获取a标签
    17         ArrayList<String> strs=new ArrayList<String>();
    18         for(Element e:elements) {
    19             String urls=e.attr("abs:href");
    20             if(urls.startsWith(strr)) {
    21                 strs.add(urls);
    22             }
    23         }
    24         return strs;
    25     }
    26     

    测试解析

    /**
     * Demo crawler: loads the Baidu News front page, follows every link that starts with
     * {@code https://kandian.youth.cn/}, and prints each linked article's title, post time
     * and source.
     */
    public class BaiduDemo {

        /**
         * Entry point. Fetches the index page, extracts the matching article links and prints
         * the details of each article. A failure on a single article is reported and skipped
         * so the remaining links are still processed.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            String str = "http://news.baidu.com";
            ArrayList<String> list;
            try {
                String content = GetUtil.getContent(str);
                list = GetUtil.getUrl(content, "https://kandian.youth.cn/");
            } catch (IOException e) {
                // Without the index page there is nothing to crawl.
                System.err.println("Failed to fetch index page " + str + ": " + e);
                return;
            }
            for (String s : list) {
                System.out.println(s);
                try {
                    printArticle(s);
                } catch (IOException e) {
                    System.err.println("Skipping " + s + ": " + e);
                }
            }
        }

        /**
         * Downloads one article page and prints its title, post time and source.
         * Pages that do not contain the expected title/detail container print nothing.
         *
         * @param url the article URL
         * @throws IOException if the page cannot be fetched
         */
        private static void printArticle(String url) throws IOException {
            Document doc = Jsoup.parse(GetUtil.getContent(url));
            Elements header = doc.select("div[class=J-title_detail title_detail]");
            // select() returns an empty collection rather than null when nothing matches,
            // so emptiness (not null) is the condition to test.
            if (header.isEmpty()) {
                return;
            }
            System.out.println("标题: " + header.select("h1").text());
            System.out.println("发帖时间: " + header.select("div[class=fl] i").text());
            System.out.println("发帖来源: " + header.select("div[class=fl] a").text());
        }

    }
    

    另一种获取方式:通过元素 id 和 class 名称解析单个新闻页面

    /**
     * Fetches a single sports.163.com article and prints its title, post time, body text,
     * source line and comment count, locating each part by element id or class name.
     *
     * <p>Parts that are missing from the page are printed as empty strings instead of
     * raising an exception.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            String str = GetUtil.getContent("http://sports.163.com/18/0207/09/DA1HPMLI00058781.html");
            Document doc = Jsoup.parse(str);

            // getElementById returns null when no element has the id, so guard before use.
            Element left = doc.getElementById("epContentLeft");
            String title = (left == null) ? "" : left.getElementsByTag("h1").text();
            System.out.println("标题: " + title);

            // Only the first 19 characters are printed (presumably a "yyyy-MM-dd HH:mm:ss"
            // timestamp — confirm against the page); clamp so shorter text does not throw.
            String timeAndSource = doc.getElementsByClass("post_time_source").text();
            int end = Math.min(19, timeAndSource.length());
            System.out.println("发帖时间: " + timeAndSource.substring(0, end));

            Element body = doc.getElementById("endText");
            System.out.println("正文:");
            System.out.println(body == null ? "" : body.text());

            // Match elements carrying both classes via a CSS selector; passing the
            // space-separated pair to getElementsByClass treats it as one class name.
            Elements source = doc.select(".ep-source.cDGray");
            System.out.println(source.text());

            // Comment count.
            Elements comments = doc.getElementsByClass("tie-cnt");
            System.out.println("跟帖 :" + comments.text());
        } catch (IOException e) {
            // ClientProtocolException is a subclass of IOException, so one handler suffices.
            System.err.println("Failed to fetch article: " + e);
        }
    }
    
    }
    
  • 相关阅读:
    平均值滤波之经典形式改进
    Matlab编程实例(4) 相位角与相关系数曲线
    Matlab编程实例(3) 函数向左或向右平移N点 左移右移
    Matlab编程实例(2) 同期平均
    Matlab编程实例(1) 移动平均
    使用js在网页上记录鼠标划圈的小程序
    《你不知道的JavaScript》整理(五)——值与原生函数
    Vuex 学习总结
    HTML移动端开发常见的兼容性总结
    一步一步实现字母索引导航栏
  • 原文地址:https://www.cnblogs.com/xiandong/p/8430136.html
Copyright © 2020-2023  润新知