• java 爬虫 爬取豆瓣 请不要害羞 图片


    import org.apache.http.HttpEntity;
    import org.apache.http.HttpResponse;
    import org.apache.http.client.HttpClient;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.DefaultHttpClient;
    
    import java.io.*;
    import java.net.HttpURLConnection;
    import java.net.URL;
    import java.text.SimpleDateFormat;
    import java.util.ArrayList;
    import java.util.Date;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    /**
     * Created by liwj on 2017/5/25.
     */
    public class Spider {
    
        private static String IMAGE_REG = "(https://img1.doubanio.com/view/group_topic/large/public/p)[0-9]{0,}(.jpg)";
        private static String HTTP_REG = "(https://www.douban.com/group/topic/)[0-9]{0,}(/)";
        private static String FILE_NAME="[0-9]{0,}(.jpg)";
        private static SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    
        /**
         * 根据url获取网页源码
         *
         * @param url
         * @return
         */
        private static String getResultByUrl(String url) {
            HttpClient hc = new DefaultHttpClient();
            try {
                HttpGet httpget = new HttpGet(url);
                httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13");
                httpget.setHeader("Accept-Encoding", "utf-8");
                HttpResponse response = hc.execute(httpget);
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    InputStream in = entity.getContent();
                    BufferedReader br = new BufferedReader(new InputStreamReader(in, "utf-8"));
                    StringBuffer buffer = new StringBuffer();
                    String line = "";
                    while ((line = br.readLine()) != null) {
                        buffer.append(line);
                    }
                    in.close();
                    entity.getContent().close();
                    return buffer.toString();
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
            return "";
        }
    
        /**
         * 获取帖子或者图片url
         *
         * @param html
         * @return
         */
        private static List<String> getAllUrl(String reg, String html) {
            List<String> urls = new ArrayList<String>();
    
            Pattern pattern = Pattern.compile(reg);
            Matcher matcher = pattern.matcher(html);
            while (matcher.find()) {
                urls.add(matcher.group());
            }
            return urls;
        }
    
        /**
         * 下载文件
         * @param fileUrl
         * @param fileName
         * @param savePath
         * @throws Exception
         */
        private static void downloadFileFromUrl(String fileUrl, String fileName, String savePath) throws Exception {
            //获取连接
            URL url = new URL(fileUrl);
            HttpURLConnection connection = (HttpURLConnection) url.openConnection();
            connection.setConnectTimeout(3 * 1000);
            //设置请求头
            connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36");
            //获取输入流
            InputStream in = connection.getInputStream();
    
            File saveDir = new File(savePath);
            if (!saveDir.exists()) {
                saveDir.mkdirs();
            }
            File file = new File(savePath + fileName);
    
            OutputStream out = new FileOutputStream(file);
    
            byte[] bytes = new byte[1024];
            int len = 0;
            while ((len = in.read(bytes)) != -1) {
                out.write(bytes, 0, len);
            }
            out.close();
            in.close();
        }
    
        public static void main(String[] args) {
            for (int page = 25; page <= 25; page += 25) {
                String url = "https://www.douban.com/group/haixiuzu/discussion?start=" + page;
                String html=getResultByUrl(url);
                //System.out.println(html);
                List<String> webPages=getAllUrl(HTTP_REG,html);
                for(String webPage:webPages){
                    String webHtml=getResultByUrl(webPage);
                    List<String> images=getAllUrl(IMAGE_REG,webHtml);
                    for(String image:images){
                        String fileName="";
                        Matcher matcher=Pattern.compile(FILE_NAME).matcher(image);
                        if(matcher.find()){
                            fileName=matcher.group();
                        }
    
                        try {
                            downloadFileFromUrl(image,fileName,"E:\image\");
                            System.out.println(df.format(new Date())+" 图片保存成功------["+fileName+"]");
                        }catch (Exception e){
                            System.err.println(df.format(new Date())+" 图片保存失败------["+fileName+"]");
                        }
                    }
                }
            }
        }
    }
  • 相关阅读:
    如何稳定地使用 Google 搜索 https://encrypted.google.com/
    windows 2008 同步时间命令
    MySql 初始化权限脚本
    [转] windows下Svn服务器之必须提交修改注释篇
    给编译好的DLL增加签名
    Anychart 破解备注
    Javascript 日期时间格式正则
    微服务项目规范(二)
    微服务项目规范(一)
    mac系统安装、启动与关闭redis
  • 原文地址:https://www.cnblogs.com/zuferj115/p/6904977.html
Copyright © 2020-2023  润新知