• 多线程爬虫Java调用wget下载文件,独立线程读取输出缓冲区


    写了个抓取appstore的爬虫,需要下载大量的app。本来是用httpclient下载,但是效果不理想,于是改为直接调用wget下载。但是如果不及时读取子进程的标准输出和错误输出,缓冲区被写满后子进程就会阻塞卡住;另外wget本身有时也会莫名卡住。

    所以我采用:

    一、独立线程读取输出信息;

    二、自己实现doWaitFor方法来代替api提供的waitFor()方法,避免子进程卡死。

    三、设置超时,杀死wget子进程,没有正确返回的话,重试一次,并把超时时间加倍;

     有了以上操作,wget不会卡死,就算卡住了也会因为超时被干掉再重试一次,所以绝大部分的app可以被抓取下来。

    import com.google.common.io.Files;
    import com.xxx.appstore.service.crawler.CalcMD5Service;
    import org.apache.commons.lang.StringUtils;
    import org.apache.commons.lang.math.RandomUtils;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;
    
    import java.io.File;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.concurrent.TimeUnit;
    
    
    /**
     * Helpers for downloading APK files by spawning an external {@code wget} process.
     *
     * <p>Each download is guarded by a timeout: a wget process that does not finish in
     * time is destroyed and the download is retried with a longer timeout.
     */
    public class CrawlerUtils {

        public static final String APK_DOWNLOAD_PATH = "/data/appstore/category/";
        private static final Logger LOGGER = LoggerFactory.getLogger(CrawlerUtils.class);

        /** Number of times wget is launched before giving up. */
        private static final int MAX_ATTEMPTS = 2;
        /** Timeout of the first attempt in seconds; attempt N waits N times this value. */
        private static final int BASE_TIMEOUT_SECONDS = 30;

        /**
         * Downloads a file with wget.
         *
         * <p>The target path is {@code APK_DOWNLOAD_PATH/<first 10 chars of MD5(category)>/
         * <MD5(displayName + random int below 1000)>.apk}.
         *
         * @param displayName  app name
         * @param category     category
         * @param download_url download URL
         * @return the path of the downloaded file on success, {@code null} on failure
         */
        public static String downloadFileByWget(String displayName, String category, String download_url) {
            if (StringUtils.isBlank(displayName) || StringUtils.isBlank(category) || StringUtils.isBlank(download_url)) {
                LOGGER.info("downloadFileByWget ERROR, displayName:{}, category:{}, download_url:{}", new Object[]{displayName, category, download_url});
                return null;
            }
            String fileName = CalcMD5Service.encoder(displayName + RandomUtils.nextInt(1000));
            String seed = CalcMD5Service.encoder(category);
            String midPath = StringUtils.left(seed, 10);
            String filePath = APK_DOWNLOAD_PATH + midPath + "/" + fileName + ".apk";
            File file = new File(filePath);
            try {
                Files.createParentDirs(file);
            } catch (IOException e) {
                LOGGER.warn("IOException", e);
                return null;
            }

            for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
                // "--" ends option parsing, so a crawled URL that starts with '-' cannot be
                // interpreted by wget as a command-line option.
                ProcessBuilder pb = new ProcessBuilder("wget", "-t", "2", "-T", "10", "-O", filePath, "--", download_url);
                LOGGER.info("wget shell: {}", pb.command());
                Process ps = null;
                try {
                    ps = pb.start();
                } catch (IOException e) {
                    LOGGER.error("IOException", e);
                }
                // The timeout grows with every attempt: 30s, then 60s.
                if (doWaitFor(ps, BASE_TIMEOUT_SECONDS * attempt) == 0) {
                    return filePath;
                }
                LOGGER.warn("Wget download failed...");
                if (Thread.currentThread().isInterrupted()) {
                    // The caller asked this thread to stop; do not spawn another wget.
                    break;
                }
            }

            // wget -O creates the output file even when the download fails. The caller only
            // receives null and cannot know the random path, so remove the partial file here.
            if (file.exists() && !file.delete()) {
                LOGGER.warn("Could not delete partial download {}", filePath);
            }
            return null;
        }


        /**
         * Waits for a sub process to exit, polling once per second, and destroys it when the
         * timeout elapses or the waiting thread is interrupted.
         *
         * @param ps      sub process, may be {@code null}
         * @param timeout timeout in seconds
         * @return the exit value of the process (0 for a normal exit); -1 if {@code ps} is
         *         {@code null}, the timeout elapsed, or the wait was interrupted
         */
        private static int doWaitFor(Process ps, int timeout) {
            if (ps == null) {
                return -1;
            }
            // Drain stdout and stderr on separate threads so the child never blocks on a full
            // pipe buffer. The collected lines are not used afterwards.
            List<String> stdoutList = new ArrayList<>();
            List<String> erroroutList = new ArrayList<>();
            new ThreadUtil(ps.getInputStream(), stdoutList).start();
            new ThreadUtil(ps.getErrorStream(), erroroutList).start();

            for (int elapsed = 0; ; elapsed++) {
                // Poll before checking the timeout so that a process which finished during the
                // last sleep is still reported with its real exit value.
                try {
                    return ps.exitValue();
                } catch (IllegalThreadStateException stillRunning) {
                    // exitValue() throws while the process has not terminated yet.
                }
                if (elapsed >= timeout) {
                    LOGGER.info("Process wget timeout {}s, destroyed!", timeout);
                    ps.destroy();
                    return -1;
                }
                try {
                    TimeUnit.SECONDS.sleep(1);
                } catch (InterruptedException e) {
                    // Restore the interrupt flag for the caller and stop the child instead of
                    // silently continuing to wait.
                    Thread.currentThread().interrupt();
                    LOGGER.warn("Interrupted while waiting for wget, process destroyed");
                    ps.destroy();
                    return -1;
                }
            }
        }
    }
    import org.apache.commons.io.Charsets;
    
    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.util.List;
    
    
    public class ThreadUtil implements Runnable {
        // 设置读取的字符编码
        private String character = Charsets.UTF_8.displayName();
        private List<String> list;
        private InputStream inputStream;
    
        public ThreadUtil(InputStream inputStream, List<String> list) {
            this.inputStream = inputStream;
            this.list = list;
        }
    
        public void start() {
            Thread thread = new Thread(this);
            thread.setDaemon(true);//将其设置为守护线程
            thread.start();
        }
    
        public void run() {
            BufferedReader br = null;
            try {
                br = new BufferedReader(new InputStreamReader(inputStream, character));
                String line = null;
                while ((line = br.readLine()) != null) {
                    list.add(line);
                }
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                try {
                    //释放资源
                    inputStream.close();
                    if (br != null) {
                        br.close();
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    
    }
  • 相关阅读:
    Java基础五
    Java基础测试
    Java练习题
    Java基础四
    Java基础三
    Java基础二
    Java基础一
    大数据讲解
    python笔记之函数 二
    iOS UICollectionView的使用(用storyboard和xib创建)
  • 原文地址:https://www.cnblogs.com/aboutblank/p/4256461.html
Copyright © 2020-2023  润新知