待爬取的牛客网的实习信息
https://www.nowcoder.com/job/center
首先在Eclipse新建一个maven项目
1、在maven文件中加入以下的代码
<!-- Minimal Maven POM for the crawler: declares the project coordinates (com.wu:TopEssay:0.0.1-SNAPSHOT) and a single dependency, jsoup 1.11.3. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <groupId>com.wu</groupId> <artifactId>TopEssay</artifactId> <version>0.0.1-SNAPSHOT</version> <dependencies> <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.11.3</version> </dependency> </dependencies> </project>
2、提取所需要的信息
这里需要编写CSS选择器规则,手写有点麻烦,我们可以利用浏览器自带的开发者工具,帮助我们快速选中所需要的元素
比如我们这里的标题,通过这种方法得到的选择器为 body > div.nk-container > div.nk-main.clearfix > div.nk-content > div > div.module-body > ul > li:nth-child(1) > div > div.reco-job-cont > a
然后我们可以在上面这个基础上进行相应的修改,有效节省了我们的时间。
package com.jsoup; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import com.entity.JobInfo; public class NiuKeSpider { private static final String url = "https://www.nowcoder.com/job/center"; public static void main(String[] args) { try { // 获取网页的源代码 Document document = Jsoup.connect(url).get(); // 筛选出和职位有关的网页源码 Elements jobs = document.getElementsByClass("reco-job-main"); System.out.println(jobs.size()); List<JobInfo> lists = new ArrayList<>(); //工作描述+公司+地点+工资+url for(Element element : jobs) { JobInfo jobInfo = new JobInfo(); jobInfo.setJobContent(element.getElementsByClass("reco-job-cont").text()); jobInfo.setUrl(element.select("div.reco-job-cont > a").attr("abs:href")); jobInfo.setCompany(element.getElementsByClass("reco-job-com").text()); jobInfo.setAddress(element.getElementsByClass("job-address").text()); jobInfo.setSalary(element.select("div.reco-job-info > div:nth-child(1) > span:nth-child(2)").text().trim()); lists.add(jobInfo); } for(JobInfo job : lists) { System.out.println(job); } } catch (IOException e) { e.printStackTrace(); } } }
3、封装所需的信息
package com.entity; /** * 职位有关的信息 * @author Administrator * */ public class JobInfo { private String jobContent; private String url; private String company; private String address; private String Salary; public String getJobContent() { return jobContent; } public void setJobContent(String jobContent) { this.jobContent = jobContent; } public String getUrl() { return url; } public void setUrl(String url) { this.url = url; } public String getCompany() { return company; } public void setCompany(String company) { this.company = company; } public String getAddress() { return address; } public void setAddress(String address) { this.address = address; } public String getSalary() { return Salary; } public void setSalary(String salary) { Salary = salary; } @Override public String toString() { return "job [jobContent=" + jobContent + ", url=" + url + ", company=" + company + ", address=" + address + ", Salary=" + Salary + "]"; } }
4、运行结果:
总结: