参考:http://blog.csdn.net/sdfiiiiii/article/details/70432060 http://blog.csdn.net/qy20115549/article/details/54945974
第一篇博客:抓取 http://www.xicidaili.com/ 等网站上公布的代理 IP,并逐个测试是否可用(测试结果貌似不是很准),把可用的代理 IP 放到一个 list 中。
第二篇博客:直接把代理 IP 写在代码里,通过该代理抓取指定页面,可以用来测试某个代理 IP 是否可用。
第一篇博客
<dependency> <groupId>com.alibaba</groupId> <artifactId>fastjson</artifactId> <version>1.2.28</version> </dependency> <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.10.2</version> </dependency>
import com.alibaba.fastjson.JSONObject; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * 获取代理IP,需要 * com.alibaba.fastjson.JSONObject以及Jsoup */ public class ProxyCralwerUnusedVPN { ThreadLocal<Integer> localWantedNumber = new ThreadLocal<Integer>(); ThreadLocal<List<ProxyInfo>> localProxyInfos = new ThreadLocal<List<ProxyInfo>>(); public static void main(String[] args) { ProxyCralwerUnusedVPN proxyCrawler = new ProxyCralwerUnusedVPN(); /** * 想要获取的代理IP个数,由需求方自行指定。(如果个数太多,将导致返回变慢) */ proxyCrawler.startCrawler(1); } /** * 暴露给外部模块调用的入口 * @param wantedNumber 调用方期望获取到的代理IP个数 */ public String startCrawler(int wantedNumber) { localWantedNumber.set(wantedNumber); kuaidailiCom("http://www.xicidaili.com/nn/", 15); kuaidailiCom("http://www.xicidaili.com/nt/", 15); kuaidailiCom("http://www.xicidaili.com/wt/", 15); kuaidailiCom("http://www.kuaidaili.com/free/inha/", 15); kuaidailiCom("http://www.kuaidaili.com/free/intr/", 15); kuaidailiCom("http://www.kuaidaili.com/free/outtr/", 15); /** * 构造返回数据 */ ProxyResponse response = new ProxyResponse(); response.setSuccess("true"); Map<String, Object> dataInfoMap = new HashMap<String, Object>(); dataInfoMap.put("numFound", localProxyInfos.get().size()); dataInfoMap.put("pageNum", 1); dataInfoMap.put("proxy", localProxyInfos.get()); response.setData(dataInfoMap); String responseString = JSONObject.toJSON(response).toString(); System.out.println(responseString); return responseString; } private void kuaidailiCom(String baseUrl, int totalPage) { String ipReg = "\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} \d{1,6}"; Pattern ipPtn = Pattern.compile(ipReg); for (int i = 1; i < totalPage; i++) { if (getCurrentProxyNumber() >= localWantedNumber.get()) { return; } try { Document doc = Jsoup.connect(baseUrl + i + "/") .header("Accept", 
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8") .header("Accept-Encoding", "gzip, deflate, sdch") .header("Accept-Language", "zh-CN,zh;q=0.8,en;q=0.6") .header("Cache-Control", "max-age=0") .header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36") .header("Cookie", "Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1462812244; _gat=1; _ga=GA1.2.1061361785.1462812244") .header("Host", "www.kuaidaili.com") .header("Referer", "http://www.kuaidaili.com/free/outha/") .timeout(30 * 1000) .get(); Matcher m = ipPtn.matcher(doc.text()); while (m.find()) { if (getCurrentProxyNumber() >= localWantedNumber.get()) { break; } String[] strs = m.group().split(" "); if (checkProxy(strs[0], Integer.parseInt(strs[1]))) { System.out.println("获取到可用代理IP " + strs[0] + " " + strs[1]); addProxy(strs[0], strs[1], "http"); } } } catch (Exception e) { e.printStackTrace(); } } } private static boolean checkProxy(String ip, Integer port) { try { //http://1212.ip138.com/ic.asp 可以换成任何比较快的网页 Jsoup.connect("http://1212.ip138.com/ic.asp") .timeout(2 * 1000) .proxy(ip, port) .get(); return true; } catch (Exception e) { return false; } } private int getCurrentProxyNumber() { List<ProxyInfo> proxyInfos = localProxyInfos.get(); if (proxyInfos == null) { proxyInfos = new ArrayList<ProxyInfo>(); localProxyInfos.set(proxyInfos); return 0; } else { return proxyInfos.size(); } } private void addProxy(String ip, String port, String protocol){ List<ProxyInfo> proxyInfos = localProxyInfos.get(); if (proxyInfos == null) { proxyInfos = new ArrayList<ProxyInfo>(); proxyInfos.add(new ProxyInfo(ip, port, protocol)); } else { proxyInfos.add(new ProxyInfo(ip, port, protocol)); } } } class ProxyInfo { private String userName = ""; private String ip; private String password = ""; private String type; private String port; private int is_internet = 1; public ProxyInfo(String ip, String port, String type) { this.ip 
= ip; this.type = type; this.port = port; } public String getUserName() { return userName; } public void setUserName(String userName) { this.userName = userName; } public String getIp() { return ip; } public void setIp(String ip) { this.ip = ip; } public String getPassword() { return password; } public void setPassword(String password) { this.password = password; } public String getType() { return type; } public void setType(String type) { this.type = type; } public String getPort() { return port; } public void setPort(String port) { this.port = port; } public int getIs_internet() { return is_internet; } public void setIs_internet(int is_internet) { this.is_internet = is_internet; } } class ProxyResponse { private String success; private Map<String, Object> data; public String getSuccess() { return success; } public void setSuccess(String success) { this.success = success; } public Map<String, Object> getData() { return data; } public void setData(Map<String, Object> data) { this.data = data; } }
第二篇博客
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.InetSocketAddress;
import java.net.MalformedURLException;
import java.net.Proxy;
import java.net.URL;
import java.net.URLConnection;

/**
 * Fetches a page through a given HTTP proxy; handy for checking whether a proxy works.
 */
public class GetHtml {

    /** Charset used to decode the fetched page. */
    private static final String PAGE_CHARSET = "gb2312";

    /** Connect and read timeout, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 3000;

    public static void main(String[] args) throws UnsupportedEncodingException {
        // Proxy ip, proxy port and the URL to fetch.
        gethtml("121.61.101.222", 808, "http://club.autohome.com.cn/bbs/forum-c-2533-1.html?orderby=dateline&qaType=-1");
    }

    /**
     * Downloads {@code url} through the HTTP proxy {@code ip:port} and prints the result.
     *
     * @param ip   proxy host
     * @param port proxy port
     * @param url  page to fetch
     * @return the page content, or an empty string when the URL is malformed or the
     *         page cannot be fetched through the proxy
     * @throws UnsupportedEncodingException if the page charset is not supported by this JVM
     */
    public static String gethtml(String ip, int port, String url) throws UnsupportedEncodingException {
        URL target;
        try {
            target = new URL(url);
        } catch (MalformedURLException e) {
            // Nothing can be fetched from a malformed URL; this says nothing about the proxy.
            e.printStackTrace();
            return "";
        }

        // Address of the proxy server.
        InetSocketAddress addr = new InetSocketAddress(ip, port);
        Proxy proxy = new Proxy(Proxy.Type.HTTP, addr); // http proxy

        InputStream in = null;
        try {
            URLConnection conn = target.openConnection(proxy);
            conn.setConnectTimeout(TIMEOUT_MILLIS);
            // Without a read timeout a proxy that accepts the connection but never
            // answers would block this call forever.
            conn.setReadTimeout(TIMEOUT_MILLIS);
            in = conn.getInputStream();
        } catch (Exception e) {
            // Any failure here means the proxy is unusable for this URL.
            System.out.println("ip " + ip + " is not available");
        }

        String s = convertStreamToString(in);
        System.out.println(s);
        return s;
    }

    /**
     * Reads the whole stream as gb2312 text and closes it.
     *
     * @param is stream to read; may be {@code null}
     * @return the decoded text with every line terminated by '\n'; an empty string when
     *         {@code is} is {@code null}; whatever was read so far if reading fails
     * @throws UnsupportedEncodingException if gb2312 is not supported by this JVM
     */
    public static String convertStreamToString(InputStream is) throws UnsupportedEncodingException {
        if (is == null) {
            return "";
        }
        StringBuilder sb = new StringBuilder();
        try {
            // Created inside the try so the stream is closed even when the charset
            // lookup fails; that exception still propagates to the caller.
            BufferedReader reader = new BufferedReader(new InputStreamReader(is, PAGE_CHARSET));
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line).append('\n');
            }
        } catch (UnsupportedEncodingException e) {
            throw e;
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                is.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return sb.toString();
    }
}