Requirements:
1. We are given a URL, which is the crawler project's entry page, i.e. where crawling starts:
http://roll.news.sina.com.cn/news/gnxw/gdxw1/index.shtml
2. Crawl all of the current day's news articles and save them to local files.
3. Make it possible to quickly look up a given article later in those local files, for analysis.
To keep things simple, "looking up" here means finding the matching article's content and printing it to the console with System.out.println().
4. Partition the crawled data into one directory per day. Each day's folder contains two files:
a data file (storing all articles crawled that day) and an index file (storing each article's position inside the data file, for fast lookup).
Before crawling, analyze the page structure, locate the parts you need, and write regular expressions for them.
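Before writing any crawler code, it helps to exercise the candidate regular expressions against a hand-written fragment of the list page. The sketch below assumes the Sina list markup (a <ul class="list_009"> containing <li><a ...>title</a><span>(date time)</span></li> entries); the sample URL, headline, and the class name RegexSanityCheck are made up for illustration.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RegexSanityCheck {

    public static void main(String[] args) {
        // A hand-written fragment imitating one entry of the Sina list page.
        String li = "<li><a href=\"http://news.sina.com.cn/c/nd/2018-08-23/doc-xxx.shtml\""
                + " target=\"_blank\">Sample headline</a><span>(08月23日 10:30)</span></li>";
        Pattern p = Pattern.compile(
                "<li><a href=\"([\\S]*?)\" target=\"_blank\">([\\S\\s]*?)</a>"
                + "<span>\\(([\\S]*?) [\\S]*?\\)</span></li>");
        Matcher m = p.matcher(li);
        if (m.find()) {
            System.out.println("url:   " + m.group(1)); // group 1: article URL
            System.out.println("title: " + m.group(2)); // group 2: headline
            System.out.println("date:  " + m.group(3)); // group 3: date, e.g. 08月23日
        }
    }
}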
I. Create a Maven project
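The project only needs the JDK standard library, so the POM stays minimal. A sketch of one possible pom.xml; the groupId/artifactId are assumptions chosen to match the cn.dd packages used below, and the Java version is likewise an assumption:

<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>cn.dd</groupId>
    <artifactId>spider</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
    </properties>
</project>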
II. Fetch the page content and store it in the data and index files
1. Build the required utility classes
/**
 * Closes streams and other resources quietly.
 * @author Administrator
 */
public class CloseUtil {

    public static void close(AutoCloseable obj) {
        if (obj != null) {
            try {
                obj.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;

/**
 * Fetches web page content.
 * @author Administrator
 */
public class WebUtil {

    /**
     * Returns the fetched content as a string, decoded with the given
     * encoding. Convenient for regex parsing of the list page.
     * @param urlStr
     * @param encoding
     * @return
     */
    public static String urlGetString(String urlStr, String encoding) {
        StringBuffer sb = new StringBuffer();
        BufferedReader br = null;
        try {
            URL url = new URL(urlStr);
            URLConnection conn = url.openConnection();
            br = new BufferedReader(new InputStreamReader(
                    conn.getInputStream(), encoding));
            String line = null;
            while ((line = br.readLine()) != null) {
                sb.append(line).append(System.lineSeparator());
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            CloseUtil.close(br);
        }
        return sb.toString();
    }

    /**
     * Returns the fetched content as a byte array, so the exact byte
     * length of an article is known when building the index.
     * @param urlStr
     * @return
     */
    public static byte[] urlGetByteArray(String urlStr) {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        BufferedInputStream bis = null;
        byte[] byteArray = new byte[0];
        try {
            URL url = new URL(urlStr);
            URLConnection conn = url.openConnection();
            bis = new BufferedInputStream(conn.getInputStream());
            int b = -1;
            while ((b = bis.read()) != -1) {
                baos.write(b);
            }
            byteArray = baos.toByteArray();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            CloseUtil.close(bis);
            CloseUtil.close(baos);
        }
        return byteArray;
    }
}
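A quick check clarifies why there are two fetch variants: the string form is convenient for regex parsing (the list page is GB2312-encoded), while the byte form preserves the exact byte count that the index will record. A minimal sketch; the class name WebUtilDemo is made up, and the URL is the entry page from the requirements:

import cn.dd.util.WebUtil;

public class WebUtilDemo {

    public static void main(String[] args) {
        String url = "http://roll.news.sina.com.cn/news/gnxw/gdxw1/index.shtml";
        // String form: decoded text, handy for regex matching.
        String html = WebUtil.urlGetString(url, "gb2312");
        // Byte form: exact length, which is what the index will store.
        byte[] raw = WebUtil.urlGetByteArray(url);
        System.out.println("chars=" + html.length() + ", bytes=" + raw.length);
    }
}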
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts the parts we need from fetched content
 * using regular expressions.
 * @author Administrator
 */
public class RegexUtil {

    /** Returns all matches of the regex, concatenated. */
    public static String match(String input, String regex) {
        StringBuffer sb = new StringBuffer();
        Pattern p = Pattern.compile(regex);
        Matcher m = p.matcher(input);
        while (m.find()) {
            sb.append(m.group());
        }
        return sb.toString();
    }

    /** Returns the given capture group of the last (typically only) match. */
    public static String match(String input, String regex, int grpNum) {
        String result = "";
        Pattern p = Pattern.compile(regex);
        Matcher m = p.matcher(input);
        while (m.find()) {
            result = m.group(grpNum);
        }
        return result;
    }

    /** Returns every match of the regex as a list. */
    public static List<String> matchList(String input, String regex) {
        List<String> list = new ArrayList<String>();
        Pattern p = Pattern.compile(regex);
        Matcher m = p.matcher(input);
        while (m.find()) {
            list.add(m.group());
        }
        return list;
    }
}
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;

/**
 * Writes the crawled content to disk.
 * @author Administrator
 */
public class IOUtil {

    /** Appends raw article bytes to the data file. */
    public static void writeDataFile(String dataFile, byte[] ba) {
        OutputStream os = null;
        try {
            os = new FileOutputStream(dataFile, true); // append mode
            os.write(ba);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            CloseUtil.close(os);
        }
    }

    /** Appends one index line to the index file. */
    public static void writeIndexFile(String indexFile, String str) {
        PrintWriter pw = null;
        try {
            // Write the index as UTF-8 explicitly; the client reads it back as UTF-8.
            pw = new PrintWriter(new OutputStreamWriter(
                    new FileOutputStream(indexFile, true), "utf-8"));
            pw.println(str);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            CloseUtil.close(pw);
        }
    }
}
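With the utilities in place, the record format is worth spelling out before it appears in the crawler: each index line holds title, byte offset, byte length, and URL, joined by the control character \u0001, which cannot occur in a title or URL. A minimal sketch of building and decoding one such record; the class name and all field values are placeholders:

public class IndexRecordDemo {

    public static void main(String[] args) {
        char sep = '\u0001';              // field separator used by the crawler below
        String title = "Sample headline"; // placeholder values throughout
        long pos = 0L;                    // byte offset inside spider_data.dat
        int len = 52340;                  // byte length of the saved article page
        String url = "http://news.sina.com.cn/placeholder.shtml";

        String record = title + sep + pos + sep + len + sep + url;
        String[] fields = record.split("\u0001");
        System.out.println("title=" + fields[0] + ", pos=" + fields[1]
                + ", len=" + fields[2] + ", url=" + fields[3]);
    }
}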
2. Everything is ready: start crawling
import java.io.File;
import java.util.List;
import cn.dd.util.IOUtil;
import cn.dd.util.RegexUtil;
import cn.dd.util.WebUtil;

public class Spider {

    public static void main(String[] args) {
        Spider.crawler();
    }

    public static void crawler() {
        String urlStr = "http://roll.news.sina.com.cn/news/gnxw/gdxw1/index.shtml";
        String encoding = "gb2312";
        // Fetch the list page.
        String input = WebUtil.urlGetString(urlStr, encoding);
        // Narrow the page down to the news list first.
        String ulRegex = "<ul class=\"list_009\">[\\s\\S]*?</ul>";
        String ulResult = RegexUtil.match(input, ulRegex);
        // Then split the list into individual <li> entries.
        String liRegex = "<li>[\\s\\S]*?</li>";
        List<String> list = RegexUtil.matchList(ulResult, liRegex);
        for (String str : list) {
            // Group 1: article URL, group 2: title, group 3: date (e.g. 08月23日).
            String grpRegex = "<li><a href=\"([\\S]*?)\" target=\"_blank\">"
                    + "([\\S\\s]*?)</a><span>\\(([\\S]*?) [\\S]*?\\)</span></li>";
            String liUrlStr = RegexUtil.match(str, grpRegex, 1);
            String liTitle = RegexUtil.match(str, grpRegex, 2);
            String liDate = RegexUtil.match(str, grpRegex, 3);
            Spider.detailProcessor(liUrlStr, liTitle, liDate);
        }
    }

    public static void detailProcessor(String liUrlStr, String liTitle, String liDate) {
        // Fetch the detail page as raw bytes.
        byte[] ba = WebUtil.urlGetByteArray(liUrlStr);
        String fileBaseDir = "F:" + File.separator + "something" + File.separator
                + liDate + File.separator;
        File fileBaseDirObj = new File(fileBaseDir);
        if (!fileBaseDirObj.exists()) {
            fileBaseDirObj.mkdirs();
        }
        String dataPath = fileBaseDir + "spider_data.dat";
        String indexPath = fileBaseDir + "spider_index.dat";
        File dataFile = new File(dataPath);
        // The article will be appended at the current end of the data file.
        long pos = dataFile.length();
        // Separate the index fields with the \u0001 control character.
        char c = '\u0001';
        StringBuffer sb = new StringBuffer();
        sb.append(liTitle).append(c).append(pos).append(c).append(ba.length)
                .append(c).append(liUrlStr);
        IOUtil.writeDataFile(dataPath, ba);
        IOUtil.writeIndexFile(indexPath, sb.toString());
    }
}
The crawler side is now essentially complete; running it produces the data and index files described in the requirements.
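A quick way to check a run is to list what was produced for each day under the base directory. A sketch; F:\something comes from the crawler above, and the class name VerifyRun is made up:

import java.io.File;

public class VerifyRun {

    public static void main(String[] args) {
        File base = new File("F:" + File.separator + "something");
        File[] days = base.listFiles();
        if (days == null) {
            System.out.println("nothing crawled yet");
            return;
        }
        for (File day : days) {
            // Each day's folder should contain exactly these two files.
            System.out.println(day.getName()
                    + " -> data: " + new File(day, "spider_data.dat").length() + " bytes"
                    + ", index: " + new File(day, "spider_index.dat").length() + " bytes");
        }
    }
}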
III. Build the client
1. Write the required utility class
import java.io.RandomAccessFile;

/**
 * Reads one article back out of the data file, given the
 * offset and length stored in the index file.
 */
public class IndexUtil {

    /** Convenience overload that decodes the bytes as UTF-8. */
    public static String index(String pos, String size, String dataFile) {
        return index(pos, size, dataFile, "utf-8");
    }

    public static String index(String pos, String size, String dataFile, String encoding) {
        String str = "";
        RandomAccessFile raf = null;
        try {
            raf = new RandomAccessFile(dataFile, "r");
            // Jump to the article's byte offset ...
            raf.seek(Long.parseLong(pos));
            // ... and read exactly its recorded length.
            byte[] b = new byte[Integer.parseInt(size)];
            raf.readFully(b); // read() may return fewer bytes; readFully() does not
            str = new String(b, encoding);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            CloseUtil.close(raf);
        }
        return str;
    }
}
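Usage is straightforward: given the offset and length strings read from an index line, IndexUtil seeks into the data file and decodes that slice. A hypothetical call; the class name, the path, and the "0"/"52340" values stand in for fields 1 and 2 of a real index line:

import java.io.File;
import cn.dd.util.IndexUtil;

public class IndexUtilDemo {

    public static void main(String[] args) {
        String dataFile = "F:" + File.separator + "something" + File.separator
                + "08月23日" + File.separator + "spider_data.dat";
        String html = IndexUtil.index("0", "52340", dataFile, "utf-8");
        System.out.println(html);
    }
}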
2. Write the client
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import cn.dd.util.CloseUtil;
import cn.dd.util.IndexUtil;

public class SpiderIndex {

    public static void main(String[] args) {
        String str = "http://news.sina.com.cn/c/nd/2018-08-23/doc-ihicsiav8438010.shtml";
        SpiderIndex.input(str);
    }

    public static void input(String str) {
        // NOTE: the date folder is hard-coded here and must match
        // a folder the spider actually created.
        String baseDir = "F:" + File.separator + "something" + File.separator
                + "08月23日" + File.separator;
        String indexFile = baseDir + "spider_index.dat";
        String dataFile = baseDir + "spider_data.dat";
        BufferedReader bu = null;
        try {
            bu = new BufferedReader(new InputStreamReader(
                    new FileInputStream(indexFile), "utf-8"));
            String line = null;
            while ((line = bu.readLine()) != null) {
                // Index fields: title, offset, length, URL.
                String[] st = line.split("\u0001");
                if (str.equals(st[3])) {
                    String s = IndexUtil.index(st[1], st[2], dataFile);
                    System.out.println(s);
                    break;
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            CloseUtil.close(bu);
        }
    }
}