A first small crawler task to get started:
Maven project
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.zhaowu</groupId>
    <artifactId>pachong01</artifactId>
    <version>0.0.1-SNAPSHOT</version>

    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.3</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.2</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.6</version>
        </dependency>
    </dependencies>
</project>
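Of the three dependencies, httpclient performs the HTTP requests and jsoup parses the returned HTML; commons-io is only a convenience library for file I/O and is not actually used by the example below. As a rough illustration, a fetched page could be saved to disk with it along these lines (the SavePage class, its method, and the output file name are invented for this sketch):

    // Hedged sketch: persisting an already-fetched HTML string with commons-io.
    // SavePage, save() and the output file name are illustrative, not part of the task code.
    import java.io.File;
    import java.io.IOException;
    import java.nio.charset.StandardCharsets;
    import org.apache.commons.io.FileUtils;

    public class SavePage {
        public static void save(String content) throws IOException {
            // Write the HTML string to a local file, creating the file if necessary
            FileUtils.writeStringToFile(new File("baidu-news.html"), content, StandardCharsets.UTF_8);
        }
    }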
Code implementation:
package com.zhaowu.renwu1;

import java.io.IOException;

import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class News {
    public static void main(String[] args) throws ClientProtocolException, IOException {
        // Create an HttpClient instance
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // Create an HttpGet instance
        HttpGet httpGet = new HttpGet("https://news.baidu.com/");
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(10000) // connection timeout: 10 seconds, in milliseconds
                .setSocketTimeout(10000)  // read timeout: 10 seconds
                .build();
        httpGet.setConfig(config);
        // Set the User-Agent request header to mimic a browser
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; W…) Gecko/20100101 Firefox/59.0");
        // Execute the GET request
        CloseableHttpResponse response = httpClient.execute(httpGet);
        // Get the response entity
        HttpEntity entity = response.getEntity();
        // Read the entity content as a UTF-8 string
        String content = EntityUtils.toString(entity, "utf-8");
        // System.out.println("Page content: " + content);
        // Parse the page into a Document object
        Document doc = Jsoup.parse(content);
        Elements hrefElements = doc.select("a[href]"); // select all <a> elements that have an href attribute
        for (Element e : hrefElements) {
            System.out.println("News title:   " + e.text());
            System.out.println("News address: " + e.attr("href"));
            System.out.println("------------------------");
        }
        // Release the connection and client resources
        response.close();
        httpClient.close();
    }
}
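The loop above prints each href attribute exactly as it appears in the page, so relative links stay relative. If Jsoup is given the page URL as a base URI, it can resolve them to absolute URLs; a small variant of the parsing step along those lines (it reuses the content string from News, and the NewsLinks class and printLinks method are invented for this sketch):

    // Hedged variant of the parsing step: pass a base URI so relative hrefs
    // can be resolved, and skip links whose text or resolved URL is empty.
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;

    public class NewsLinks {
        public static void printLinks(String html) {
            Document doc = Jsoup.parse(html, "https://news.baidu.com/");
            for (Element e : doc.select("a[href]")) {
                String title = e.text();
                String url = e.absUrl("href"); // empty string when the href cannot be resolved
                if (!title.isEmpty() && !url.isEmpty()) {
                    System.out.println(title + " -> " + url);
                }
            }
        }
    }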