开始练习网络爬虫
package com.sreach.image;
import java.io.IOException;
import java.util.LinkedList;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.junit.Test;
/**
* 处理下载链接目的获取图片链接地址
*
* */
public class DownImageAfter {
private static LinkedList<String> urlQueue1 = new LinkedList<String>();
@Test
public void test() throws Exception {
String url = "http://search.jd.com/Search?keyword=php";
String content = getHttpClentResult(url);
getTwoEndLink(content);
for (int i = 0; i < urlQueue1.size();) {
String ra_url = urlQueue1.remove();
String ts = getHttpClentResult(ra_url);
String tss = getJD_end_Sevel_URL(ts);
ImageDown.getResult(tss);
}
}
/**
* 抓取京东最后一层的真实图片
*
* @param content
* @return
* */
private final static String getJD_end_Sevel_URL(String content) {
// System.out.println(content);
if (content.split("jqimg")[1].substring(0, 1).equals(""")) {
return content.split(""jqimg":"")[1].split(""")[0].replaceAll(
"/", "").replaceAll("\\", "/");
} else {
String tes = content.split("jqimg="")[1].split(""")[0];
return tes;
}
}
/**
* 根据url通过httpclent获取内容
*
* @param url
* @return
* @throws IOException
* @throws ClientProtocolException
* */
private final static String getHttpClentResult(String url)
throws ClientProtocolException, IOException {
HttpClient client = new DefaultHttpClient();
HttpGet get$ = new HttpGet(url);
get$.setHeader("User-Agent",
" Mozilla/5.0 (Windows NT 6.1; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0");
HttpResponse response = client.execute(get$);
HttpEntity entity = response.getEntity();
return EntityUtils.toString(entity);
}
/**
* 抓取后尾第二层链接到linked集合中
* */
private final static void getTwoEndLink(String content) {
String ts[] = content.split("<div class="p-name">");
for (int i = 1; i < ts.length; i++) {
String t1 = ts[i].split("href="")[1].split(""")[0];
urlQueue1.addLast(t1);
}
}
}
package com.sreach.image;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Random;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import com.sreach.util.StringUtil;
/** 图片测试下载 */
public class ImageDown {
// 图片存储路径
private final static String SAVEGB_PATH = "D:\图片下载\";
// jpg格式
private final static String IMAGEPATTEN_JPG = ".jpg";
/**
* 根据链接后缀是否为图片作为判断
*
* @param 文件原始超链接
* @return
* */
public final static Boolean getResult(String imageUrlPath) {
Boolean tag = false;
if (imageUrlPath.endsWith(IMAGEPATTEN_JPG)) {
tag = true;
saveIMAGE(imageUrlPath);
}
return tag;
}
/**
* 通过超链接保存图片
*
* @param 文件原始超链接
* */
private final static void saveIMAGE(String imageUrlPath) {
HttpClient client = new DefaultHttpClient();
try {
HttpGet get$ = new HttpGet(imageUrlPath);
HttpResponse response = client.execute(get$);
HttpEntity entity = response.getEntity();
byte bs[] = null;
if (entity != null) {
bs = EntityUtils.toByteArray(entity);
}
StringBuilder bbs = new StringBuilder();
bbs.append(SAVEGB_PATH).append(
getImageSourceWebsite(imageUrlPath));
File file = new File(bbs.toString());
if (!file.exists()) {
file.mkdir();
}
savePC_GB(getFileName(imageUrlPath, bbs.append("\").toString()),
bs);
} catch (ClientProtocolException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
/**
* 保存图片到本地硬盘
*
* @param filepath
* 文件路径,content 内容字节文件
* */
private final static void savePC_GB(String filepath, byte[] content)
throws IOException {
File f = new File(filepath);
if (f.exists()) {
f = new File(filepath.replace(".jpg",
Integer.toString(new Random().nextInt()) + ".jpg"));
}
BufferedOutputStream out = new BufferedOutputStream(
new FileOutputStream(f));
out.write(content);
out.close();
System.out.println("下载" + f.getName() + "图片成功");
}
/**
* 获取图片文件名
*
* @原始文件链接地址(包含来源网站)
* @return
* */
private final static String getFileName(String imageUrlPath,
String realPath) {
StringBuilder bs = new StringBuilder();
bs.append(realPath);
bs.append(StringUtil.getDateAndTime("YYYYMMddHHmmss"));
bs.append("-");
String path = getImageSourceWebsite(imageUrlPath);
bs.append(path);
bs.append(".jpg");
return bs.toString();
}
public final static String getImageSourceWebsite(String imageUrlPath) {
return imageUrlPath.split("http://")[1].split("/")[0];
}
public static void main(String[] args) {
String imageUrlPath = "http://img12.360buyimg.com/n0/g14/M05/05/16/rBEhVVHmJ0YIAAAAAAHB_WJwDrkAABHIgOyzAcAAcIV854.jpg";
getResult(imageUrlPath);
}
}