• 爬虫任务一:使用 HttpClient 爬取百度新闻首页的新闻标题和 URL(页面编码为 UTF-8)


    第一个入手的爬虫小任务:

    Maven 工程(pom.xml):

    <!-- Maven build descriptor for the "pachong01" crawler exercise. -->
    <project xmlns="http://maven.apache.org/POM/4.0.0"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>
        <groupId>com.zhaowu</groupId>
        <artifactId>pachong01</artifactId>
        <version>0.0.1-SNAPSHOT</version>

        <properties>
            <!-- The Java sources contain non-ASCII string literals; pin the source
                 encoding so the build does not depend on the platform default. -->
            <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
            <!-- Pin the language level instead of relying on the compiler plugin's
                 version-dependent default. -->
            <maven.compiler.source>1.8</maven.compiler.source>
            <maven.compiler.target>1.8</maven.compiler.target>
        </properties>

        <dependencies>
            <!-- HTTP client used to download the page.
                 https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
            <dependency>
                <groupId>org.apache.httpcomponents</groupId>
                <artifactId>httpclient</artifactId>
                <version>4.5.3</version>
            </dependency>

            <!-- HTML parser used to select the link elements.
                 https://mvnrepository.com/artifact/org.jsoup/jsoup -->
            <dependency>
                <groupId>org.jsoup</groupId>
                <artifactId>jsoup</artifactId>
                <version>1.11.2</version>
            </dependency>

            <!-- NOTE(review): not imported by the News class shown with this POM;
                 confirm whether other code in the project needs it.
                 https://mvnrepository.com/artifact/commons-io/commons-io -->
            <dependency>
                <groupId>commons-io</groupId>
                <artifactId>commons-io</artifactId>
                <version>2.6</version>
            </dependency>
        </dependencies>
    </project>

    代码实现:

    package com.zhaowu.renwu1;
    
    import java.io.IOException;
    
    import org.apache.http.HttpEntity;
    import org.apache.http.client.ClientProtocolException;
    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.util.EntityUtils;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    /**
     * Downloads the Baidu News front page and prints the text and {@code href}
     * of every anchor element found on it.
     */
    public class News {
        /**
         * Fetches {@code https://news.baidu.com/}, parses the HTML with jsoup and
         * writes one title/address pair per {@code <a href>} element to standard output.
         *
         * @param args ignored
         * @throws ClientProtocolException if the HTTP exchange violates the protocol
         * @throws IOException if the request fails, times out, or the response has no body
         */
        public static void main(String[] args) throws ClientProtocolException, IOException {
            HttpGet httpGet = new HttpGet("https://news.baidu.com/");

            RequestConfig config = RequestConfig.custom()
                    .setConnectTimeout(10000) // connect timeout: 10 seconds, in milliseconds
                    .setSocketTimeout(10000)  // socket read timeout: 10 seconds
                    .build();
            httpGet.setConfig(config);
            // Present a browser-like User-Agent. This is a complete, ASCII-only
            // Firefox 59 string; the previous literal was cut off with a "…" character.
            httpGet.setHeader("User-Agent",
                    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0");

            String content;
            // try-with-resources closes the response first, then the client, so the
            // connection and its pool are released even when reading the body fails.
            try (CloseableHttpClient httpClient = HttpClients.createDefault();
                    CloseableHttpResponse response = httpClient.execute(httpGet)) {
                HttpEntity entity = response.getEntity();
                if (entity == null) {
                    // Report a bodiless response as an I/O failure instead of passing
                    // null on to EntityUtils.toString.
                    throw new IOException("Response from " + httpGet.getURI() + " has no body");
                }
                // "utf-8" is the fallback charset passed to EntityUtils.
                content = EntityUtils.toString(entity, "utf-8");
            }

            // Parse the downloaded HTML into a jsoup document.
            Document doc = Jsoup.parse(content);

            // Select every <a> element that carries an href attribute.
            Elements hrefElements = doc.select("a[href]");
            for (Element e : hrefElements) {
                System.out.println("新闻标题:" + e.text());
                System.out.println("新闻地址:" + e.attr("href"));
                System.out.println("------------------------");
            }
        }
    }
  • 相关阅读:
    浅析linux 下的/etc/profile、/etc/bashrc、~/.bash_profile、~/.bashrc(转)
    【引用】如何关闭SELinux
    typedef 用法(转)
    【引用】让source insight在窗口标题栏上显示文件全路径
    c语言 typedef(转)
    ip分片 tcp分段(转)
    linux 命令 pushd popd cd 区别
    linux xargs
    JS实现简单hashtable
    Page.ClientScript.RegisterClientScriptBlock 与RegisterClientScriptBlock
  • 原文地址:https://www.cnblogs.com/sutao/p/9012393.html
Copyright © 2020-2023  润新知