• Java 小爬虫


    爬取煎蛋网

    1、找出页面网址的规律

    2、设计页面图片网址的正则

    代码:

    import java.io.BufferedInputStream;
    import java.io.BufferedOutputStream;
    import java.io.BufferedReader;
    import java.io.BufferedWriter;
    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.io.OutputStreamWriter;
    import java.net.MalformedURLException;
    import java.net.URL;
    import java.nio.charset.StandardCharsets;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    /**
     * Small image crawler: walks a range of jandan.net listing pages, collects every
     * quoted .jpg/.gif address found in the page source and downloads each image into
     * a local directory under a sequential numeric file name.
     */
    public class SpiderTest {

        /** Image URLs collected from the page currently being processed; cleared after each page. */
        private static final List<String> urlStrs = new ArrayList<String>();

        /**
         * Matches a double quote followed by non-whitespace characters ending in
         * ".jpg" or ".gif". Compiled once instead of once per page.
         */
        private static final Pattern IMAGE_URL_PATTERN = Pattern.compile("\"[\\S]*\\.(jpg|gif)");

        /** Running counter used as the file name of the next downloaded image. */
        private static int num = 0;

        /**
         * Crawls pages {@code start..end} (inclusive) and downloads every image found.
         *
         * @param args unused
         * @throws Exception if a page or an image cannot be fetched or written
         */
        public static void main(String[] args) throws Exception {
            // Example of a page being crawled: http://jandan.net/ooxx/page-2381#comments
            String dstDir = "d:/dstDir";
            int start = 2340;    // first page
            int end = 2370;      // last page

            for (int i = start; i <= end; i++) {
                String urlStr = "http://jandan.net/ooxx/page-" + i + "#comments";
                matchAll(urlStr);
                for (String imgStr : urlStrs) {
                    downFile(imgStr, dstDir);
                    Thread.sleep(300);    // pause between downloads
                }
                urlStrs.clear();
            }
            System.out.println("网址抓取完毕");
        }

        /**
         * Reads the page at {@code urlStr} line by line and appends every image URL
         * found to {@link #urlStrs}.
         *
         * @param urlStr address of the page to crawl
         * @throws Exception if the address is malformed or the page cannot be read
         */
        private static void matchAll(String urlStr) throws Exception {
            URL url;
            try {
                url = new URL(urlStr);
            } catch (MalformedURLException e) {
                // Keep the original message but preserve the cause for diagnosis.
                throw new Exception("网址不存在", e);
            }

            // NOTE(review): assumes the site serves UTF-8; the original used the platform default charset.
            try (BufferedReader read = new BufferedReader(
                    new InputStreamReader(url.openStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = read.readLine()) != null) {
                    urlStrs.addAll(extractImageUrls(line));
                }
            }
        }

        /**
         * Finds every quoted image address in one line of page source.
         * Each raw match (including its leading quote) is echoed to standard output.
         * Package-private so it can be exercised without network access.
         *
         * @param line one line of HTML source
         * @return the image URLs found, in order of appearance; empty if none
         */
        static List<String> extractImageUrls(String line) {
            List<String> found = new ArrayList<String>();
            Matcher m = IMAGE_URL_PATTERN.matcher(line);
            while (m.find()) {
                System.out.println(m.group());
                String address = m.group().substring(1);    // drop the leading double quote
                // Only protocol-relative addresses ("//host/...") need a scheme; prepending
                // "http:" to an already absolute URL would corrupt it.
                found.add(address.startsWith("//") ? "http:" + address : address);
            }
            return found;
        }

        /**
         * Maps an image URL to the file extension used for the saved file.
         *
         * @param urlStr image address
         * @return ".jpg" or ".gif"
         * @throws IllegalArgumentException if the address ends with neither "jpg" nor "gif"
         */
        static String extensionOf(String urlStr) {
            if (urlStr.endsWith("jpg")) {
                return ".jpg";
            }
            if (urlStr.endsWith("gif")) {
                return ".gif";
            }
            throw new IllegalArgumentException("Unsupported image type: " + urlStr);
        }

        /**
         * Downloads the image at {@code urlStr} into {@code dstDir}, naming it with the
         * next value of the running counter plus the image's extension.
         *
         * @param urlStr image address
         * @param dstDir directory the image is stored in; created if missing
         * @throws Exception if the directory cannot be created or the download fails
         */
        private static void downFile(String urlStr, String dstDir) throws Exception {
            // Resolve the extension first so an unsupported URL never consumes a file number.
            String extension = extensionOf(urlStr);

            File dir = new File(dstDir);
            // mkdirs() also creates missing parent directories, unlike mkdir().
            if (!dir.isDirectory() && !dir.mkdirs()) {
                throw new IOException("Cannot create directory: " + dir);
            }

            File imgFile = new File(dstDir, (num++) + extension);
            URL url = new URL(urlStr);
            // try-with-resources closes both streams even when the copy fails midway.
            try (BufferedInputStream in = new BufferedInputStream(url.openStream());
                 BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(imgFile))) {
                System.out.println("开始下载。。");
                byte[] bBuf = new byte[1024];
                int len;
                while ((len = in.read(bBuf)) != -1) {
                    out.write(bBuf, 0, len);
                }
            }
            System.out.println("下载完毕");
        }

        /**
         * Copies the source of the page at {@code u} into d:/tmp.txt, one line per
         * source line. Not called anywhere in this class.
         *
         * @param u address of the page to copy
         * @throws Exception if the page cannot be read or the file cannot be written
         */
        private void getSourceCode(String u) throws Exception {
            // Example address: http://m.onepiece.cc/post/10001/
            File f = new File("d:/tmp.txt");
            URL url = new URL(u);
            // NOTE(review): UTF-8 is assumed for both reading and writing — confirm for the target site.
            try (BufferedReader read = new BufferedReader(
                         new InputStreamReader(url.openStream(), StandardCharsets.UTF_8));
                 BufferedWriter write = new BufferedWriter(
                         new OutputStreamWriter(new FileOutputStream(f), StandardCharsets.UTF_8))) {
                String s;
                while ((s = read.readLine()) != null) {
                    write.write(s);
                    write.write('\n');
                }
            }
            System.out.println("拷贝完成");
        }
    }

  • 相关阅读:
    使用SO_RCVTIMEO套接字选项为recvfrom设置超时
    使用select为描述符设置超时
    套接字超时设置方法
    使用SIGALRM为recvfrom设置超时
    使用SIGALRM为connect设置超时
    20200410 阿里巴巴Java开发手册
    20200409 Vue 视频学习【归档】
    20200319 Spring MVC 官方文档【归档】
    20200319 Spring Web MVC 2-5
    20200319 Spring Web MVC 1
  • 原文地址:https://www.cnblogs.com/boluoboluo/p/6511061.html
Copyright © 2020-2023  润新知