思路:根据给定URL分析其源码,得到所需的网页内容的位置,制定规则采集或下载之
采集的图片和文字示例:
tags:
tag:brazil
tag:dog
tag:pet
tag:pointyfaceddog
tag:Pets Around the World
imageUrl:http://farm2.staticflickr.com/1241/1050065123_9739d1283a_z.jpg
代码:
import java.io.BufferedWriter; import java.io.File; import java.io.FileOutputStream; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.net.URL; import java.net.URLConnection; import org.jsoup.*; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; /** * @author ZTX * jsoup下载测试 */ public class TestSoup { //待采集网址 private static String url = "http://www.flickr.com/photos/9339954@N03/1050065123/in/photolist-2AMRM6-3JesJx-4n7zTL-4sVi7P-4t3yt6-4Ay5SR-4SV4Dz-57LhgB-5g78Rp-5huiYa-5jmQqP-5qawPz-5qaAFP-5qeRNy-5qf2gd-5qfdSQ-5qfe33-5qfKSf-5wYskN-5XxHra-5YcEX1-64zW7q-66U2Lp-67nNta-6aJgvN-6eFSCL-6fiVqR-6ghPXc-6nFAhA-6oZZso-6uQTsi-6vizoJ-6Dpn3e-6EuENc-6QK6TG-6Z4BBW-7347jr-7347nt-7347ra-7347ta-754FLt-78SRmT-7jdxPt-7vNc4q-h3h9sV-9JA3zQ-h3oJ39-h3gL6w-hrN3DF-ek7Tkt-9JA8ns"; //采集的图片存放路径 private static String imgPath = "./download/img.jpg"; public static void main(String []args){ // 采集和下载 getHTML(url); downloadImg(getFlickrImgUrl(url),imgPath); } /** * @param url * 根据网址采集网页HTML文字内容 */ private static void getHTML(String url) { // 使用jsoup选择器语法,链接传递过来的url,并赋值给Document Document doc; try { doc = Jsoup.connect(url).get(); Element tagsUL = doc.getElementById("thetags");//tags ul Elements tags =tagsUL.getElementsByTag("li"); System.out.println("tags:"); for(Element i:tags) { String tag=i.tagName(); System.out.println("tag:"+i.text()); } } catch (IOException e) { e.printStackTrace(); } } /** * 根据图片网址下载图片 * 2013-12-15 20:41:54 * @param realurl */ private static void downloadImg(String imageUrl,String imgPath) { try { //下载 URL downloadUrl = new URL( imageUrl); URLConnection uc = downloadUrl.openConnection(); InputStream is = uc.getInputStream(); File file = new File( imgPath); FileOutputStream out = new FileOutputStream(file); int i=0; while ((i=is.read())!=-1) { out.write(i); } is.close(); } catch (IOException e) { e.printStackTrace(); } } /** * 根据网址得到图片URL * 2013-12-15 20:41:17 * @param url * @return * 
@throws IOException */ private static String getFlickrImgUrl(String url) { Document doc = null; try { doc = Jsoup.connect(url).get(); } catch (IOException e) { e.printStackTrace(); } String imageUrl = doc .getElementById("main-photo-container") .getElementsByTag("img") .first() .absUrl("src"); System.out.println("imageUrl:"+imageUrl); return imageUrl; } }