目录
1.jar包----jsoup
2.解析步骤(爬取51job网页信息)
一、jar包----jsoup
jsoup包是开源的html解析工具包
jsoup包下载链接http://www.mvnjar.com/org.jsoup/jsoup/1.11.3/detail.html
二、解析步骤(爬取51job网页信息)
1.打开需要爬取的网页
2.通过链接对象获取文档对象
3.通过jsoup中的select()解析文档对象
4.创建一个实体类,内容包括需要爬取的信息
5.将select查找出来的信息放到实体类中,并将这些对象保存到集合数组中
ps:利用jsoup解析html需要遵循html的语法
package com.work.crawler;

/**
 * Value object for one job posting scraped from 51job.
 *
 * @author Hu YS
 *
 * 2018-09-01
 */
public class Work implements Comparable<Work> {

    private String position; // job title
    private String company;  // company name
    private String place;    // work location
    private String salary;   // salary range (free-form text from the page)
    private String date;     // publish date; assumed format "MM-dd" — TODO confirm against the scraped page

    public String getPosition() {
        return position;
    }

    public void setPosition(String position) {
        this.position = position;
    }

    public String getCompany() {
        return company;
    }

    public void setCompany(String company) {
        this.company = company;
    }

    public String getPlace() {
        return place;
    }

    public void setPlace(String place) {
        this.place = place;
    }

    public String getSalary() {
        return salary;
    }

    public void setSalary(String salary) {
        this.salary = salary;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    @Override
    public String toString() {
        return "Work [position=" + position + ", company=" + company + ", place=" + place + ", salary=" + salary
                + ", date=" + date + "]";
    }

    /**
     * Orders postings by publish date, newest first (descending by "MM-dd").
     *
     * <p>Bug fix: the original implementation compared this object's month
     * against its own day ({@code i1 >= i2}) and the other object's month
     * against its own day ({@code o1 > o2}); the two dates were never actually
     * compared to each other, so the ordering was meaningless. This version
     * compares month first, then day.
     *
     * <p>Assumes {@link #date} is non-null and at least 5 characters in
     * "MM-dd" form; otherwise this throws (same as the original).
     *
     * @param o the other posting
     * @return negative if this posting is newer than {@code o}, positive if
     *         older, 0 if same date
     */
    @Override
    public int compareTo(Work o) {
        int thisMonth = Integer.parseInt(this.getDate().substring(0, 2));
        int thisDay = Integer.parseInt(this.getDate().substring(3, 5));
        int otherMonth = Integer.parseInt(o.getDate().substring(0, 2));
        int otherDay = Integer.parseInt(o.getDate().substring(3, 5));
        if (thisMonth != otherMonth) {
            // Newer month sorts first.
            return Integer.compare(otherMonth, thisMonth);
        }
        // Same month: newer day sorts first.
        return Integer.compare(otherDay, thisDay);
    }
}
package com.work.crawler;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import net.sf.json.JSONArray;

/**
 * Worker thread that fetches one 51job search-result page and appends the
 * parsed postings to a shared list.
 *
 * @author Administrator
 */
public class Crawler implements Runnable {

    /** URL of the result page this worker fetches. */
    private String url;
    /** Shared result list; must be thread-safe — many Crawler threads add to it concurrently. */
    private List<Work> list;

    /**
     * @param url  result page to fetch
     * @param list shared collection the parsed {@link Work} entries are added to
     */
    public Crawler(String url, List<Work> list) {
        this.list = list;
        this.url = url;
    }

    @Override
    public void run() {
        try {
            // Fetch the page and parse it into a DOM document (5s timeout).
            Document doc = Jsoup.connect(url).timeout(5000).get();
            // Result rows: skip the first rows of the table (header/ads) via :gt(2).
            Elements rows = doc.select(".dw_table .el:gt(2)");
            for (Element row : rows) {
                Work work = new Work();
                String position = row.select(".t1 span a").text();
                String company = row.select(".t2 a").text();
                String place = row.select(".t3").text();
                String salary = row.select(".t4").text();
                String date = row.select(".t5").text();
                work.setCompany(company);
                work.setDate(date);
                work.setPlace(place);
                work.setSalary(salary);
                work.setPosition(position);
                System.out.println(work);
                list.add(work);
            }
        } catch (IOException e) {
            // Best-effort: a failed page is logged and skipped, the other threads continue.
            e.printStackTrace();
        }
    }

    /**
     * Serializes the scraped postings as a JSON array and appends them to a
     * local file.
     *
     * <p>Bug fixes vs. the original: the path literal used single backslashes
     * ({@code "E:\用户\..."}), which are illegal escape sequences in a Java
     * string and do not compile — they must be doubled. The original also
     * closed the writer in a {@code finally} block that NPE'd if the
     * {@code FileWriter} constructor itself threw; try-with-resources handles
     * both close and that edge case.
     *
     * <p>NOTE(review): {@code FileWriter} uses the platform default charset;
     * if the JSON must be UTF-8, wrap an {@code OutputStreamWriter} instead —
     * confirm the consumer's expectation.
     *
     * @param list the postings to persist
     */
    public static void save(List<Work> list) {
        // Append mode: repeated runs accumulate JSON arrays, one per line.
        try (BufferedWriter bw = new BufferedWriter(new FileWriter("E:\\用户\\Desktop\\目标\\1.json", true))) {
            JSONArray json = JSONArray.fromObject(list);
            bw.write(json.toString());
            bw.newLine();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
package com.work.crawler;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

/**
 * Crawls 51job search-result pages concurrently with a thread pool and saves
 * the collected postings to disk.
 *
 * @author Administrator
 */
public class Main {

    // Bug fix: the list is appended to by many pool threads at once, so a
    // plain ArrayList is not safe — wrap it in a synchronized view.
    static List<Work> list = Collections.synchronizedList(new ArrayList<>());

    public static void main(String[] args) {
        long start = System.currentTimeMillis();
        // Thread pool: one task per result page.
        ExecutorService es = Executors.newCachedThreadPool();
        // NOTE(review): the original comment said "stop at 150 pages" but the
        // loop submits pages 1..149; behavior kept as-is — confirm intent.
        for (int page = 1; page < 150; page++) {
            // Bug fix: the query string was corrupted by HTML-entity mangling —
            // "&degreefrom" had turned into "°reefrom" ("&deg" -> "°"), breaking
            // the degreefrom parameter. Restored to "&degreefrom=99".
            String url = "https://search.51job.com/list/010000,000000,0000,00,9,99,Java%2B%25E5%25BC%2580%25E5%258F%2591%25E5%25B7%25A5%25E7%25A8%258B%25E5%25B8%2588,2,"
                    + page
                    + ".html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";
            // Submit one crawler task per page (url = page address, list = shared sink).
            es.execute(new Crawler(url, list));
        }
        // Stop accepting new tasks and wait for the submitted ones to finish.
        // Bug fix: the original busy-waited on isTerminated() in a tight loop,
        // burning a CPU core; awaitTermination blocks instead.
        es.shutdown();
        try {
            es.awaitTermination(1, TimeUnit.HOURS);
        } catch (InterruptedException e) {
            // Restore the interrupt flag and fall through to save what we have.
            Thread.currentThread().interrupt();
        }
        Crawler.save(list);
        System.out.println("over");
        long end = System.currentTimeMillis();
        System.out.println(end - start);
    }
}