• Java Socket Crawler


    # Repository

    https://github.com/mofadeyunduo/crawler

    # Preface

    1. The code is continuously being optimized and updated.

    2. If you have suggestions, please leave a comment.

    # Introduction

    1. Multithreaded, based on ExecutorService.

    2. HTTP requests are made directly over a Socket (a usage sketch follows below).
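
    A rough usage sketch of the two points above. SocketCrawler's constructor signature is taken from the code further down; the Task constructor is a guess, since the Task class itself is not listed in this post:

    package per.piers.crawler.service;

    import java.util.Arrays;
    import java.util.LinkedList;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;

    public class CrawlerMain {
        public static void main(String[] args) {
            LinkedList<String> urls = new LinkedList<>(Arrays.asList(
                    "http://example.com/a", "http://example.com/b",
                    "http://example.com/c", "http://example.com/d"));
            int threads = 2;
            ExecutorService pool = Executors.newFixedThreadPool(threads);
            Task task = new Task(threads, urls.size()); // hypothetical constructor; Task is not shown in the post
            // Each SocketCrawler owns a slice of the URL list, so threads are reused
            // instead of one thread being spawned per page.
            for (int i = 0; i < threads; i++) {
                LinkedList<String> slice = new LinkedList<>();
                for (int j = i; j < urls.size(); j += threads) slice.add(urls.get(j));
                pool.submit(new SocketCrawler(slice, "output", pool, task));
            }
        }
    }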

    # Optimization ideas

    1. Thread reuse: rather than creating a dedicated thread for every page, each Crawler is responsible for crawling multiple pages.

    2. Read/write several pages in one pass to reduce IO time (not yet implemented).

    3. Multiple proxies, to avoid the server refusing to respond when too many requests come from one address (not yet implemented; a sketch follows below).
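
    For item 3, one possible direction (purely a sketch, not in the repository; ProxyRotator is a made-up name) is to keep a list of plain HTTP forward proxies and connect through a different one for each request. When going through a forward proxy, the request line would carry the absolute URI (GET http://host/path HTTP/1.1) rather than the path-only form currently built in crawl().

    package per.piers.crawler.service;

    import java.io.IOException;
    import java.net.InetSocketAddress;
    import java.net.Socket;
    import java.util.List;
    import java.util.concurrent.atomic.AtomicInteger;

    // Hypothetical helper: rotates across several HTTP forward proxies so that
    // not every request leaves from the same address.
    public class ProxyRotator {

        private final List<InetSocketAddress> proxies;
        private final AtomicInteger next = new AtomicInteger();

        public ProxyRotator(List<InetSocketAddress> proxies) {
            this.proxies = proxies;
        }

        // Round-robin choice of the next proxy endpoint.
        public Socket connect() throws IOException {
            InetSocketAddress proxy = proxies.get(Math.floorMod(next.getAndIncrement(), proxies.size()));
            return new Socket(proxy.getHostString(), proxy.getPort());
        }
    }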

    # Code

    SocketCrawler.java: responsible for crawling pages.

    package per.piers.crawler.service;
    
    import org.apache.logging.log4j.LogManager;
    import org.apache.logging.log4j.Logger;
    import org.w3c.dom.Document;
    import org.w3c.dom.NamedNodeMap;
    import org.w3c.dom.NodeList;
    import org.xml.sax.SAXException;
    import per.piers.crawler.model.HTTPStatus;
    
    import javax.xml.parsers.DocumentBuilder;
    import javax.xml.parsers.DocumentBuilderFactory;
    import javax.xml.parsers.ParserConfigurationException;
    import java.io.*;
    import java.net.Socket;
    import java.util.*;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.TimeUnit;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    /**
     * Created by Piers on 2017/4/15.
     */
    public class SocketCrawler implements Runnable {
    
        private Task task;
        private static Logger logger = LogManager.getLogger(SocketCrawler.class.getName());
        private Map<String, String> headers = new LinkedHashMap<>();
        private LinkedList<String> websites;
        private String charset = "utf-8";
        private ExecutorService executorService;
        private String outputPath;
    
        public SocketCrawler(LinkedList<String> websites, String outputPath, ExecutorService executorService, Task task) {
            this(websites, outputPath, null, null, executorService, task);
        }
    
        public SocketCrawler(LinkedList<String> websites, String outputPath, String charset, ExecutorService executorService, Task task) {
            this(websites, outputPath, charset, null, executorService, task);
        }
    
        public SocketCrawler(LinkedList<String> websites, String outputPath, String charset, Map<String, String> headers, ExecutorService executorService, Task task) {
            if (websites != null) {
                this.websites = websites;
            } else {
                throw new NullPointerException("websites is null");
            }
            if (executorService != null) {
                this.executorService = executorService;
            } else {
                throw new NullPointerException("executorService is null");
            }
            if (outputPath != null) {
                this.outputPath = outputPath;
                new File(outputPath).mkdirs();
            } else {
                throw new NullPointerException("outputPath is null");
            }
            if (task != null) {
                this.task = task;
            } else {
                throw new NullPointerException("task is null");
            }
            if (charset != null) this.charset = charset;
            logger.debug("Charset: {}", this.charset);
            if (headers != null) this.headers.putAll(headers);
            try {
                DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
                DocumentBuilder documentBuilder = documentBuilderFactory.newDocumentBuilder();
                Document document = documentBuilder.parse(new File("target/classes/defaultHeaders.xml"));
                NodeList nodeList = document.getElementsByTagName("header");
                for (int i = 0; i < nodeList.getLength(); i++) {
                    NamedNodeMap map = nodeList.item(i).getAttributes();
                    this.headers.put(map.getNamedItem("key").getNodeValue(), map.getNamedItem("value").getNodeValue());
                }
            } catch (ParserConfigurationException | IOException | SAXException e) {
                e.printStackTrace();
            }
        }
    
        public String crawl(String website) throws IOException {
            synchronized (task) {
                task.addCount();
                logger.info("Count: {}", task.getCount());
            }
            logger.traceEntry();
            logger.info("Crawling: {}", website);
            String[] resolves = resolveWebsite(website);
            String host = resolves[0], request = resolves[1];
            Socket socket = new Socket(host, 80);
            setOutputStream(socket.getOutputStream(), host, request);
            try {
                try (BufferedReader reader = new BufferedReader(new InputStreamReader(socket.getInputStream(), charset))) {
                    String firstLine = reader.readLine();
                    HTTPStatus status = getStatus(firstLine);
                    if (status == null) {
                        String error = String.format("Unknown HTTP status: %s", website);
                        logger.error(error);
                        throw new IllegalStateException(error);
                    }
                    switch (status) {
                        case NOT_FOUND:
                            logger.warn("404: {}", website);
                            return null;
                    }
                    // Skip the response headers: read up to the first blank line.
                    String line = null;
                    while ((line = reader.readLine()) != null && !line.equals("")) ;
                    // Read the response body line by line.
                    StringBuilder builder = new StringBuilder();
                    while ((line = reader.readLine()) != null) {
                        builder.append(line + "\n");
                    }
                    logger.info("Crawled: {}", website);
                    return builder.toString();
                }
            } finally {
                socket.close();
                logger.traceExit();
            }
        }
    
        private String[] resolveWebsite(String website) {
            Pattern pattern = Pattern.compile("http://(?<domain>[\\w.]+)(?<request>/.*)?", Pattern.CASE_INSENSITIVE);
            Matcher matcher = pattern.matcher(website);
            if (!matcher.find()) {
                String error = String.format("Probably %s is not a valid website", website);
                logger.error(error);
                throw new InputMismatchException(error);
            }
            String host = matcher.group("domain");
            String request = matcher.group("request");
            if (request == null) request = "/";
            logger.debug("Domain is {}", host);
            logger.debug("Request is {}", request);
            return new String[]{host, request};
        }
    
        private void setOutputStream(OutputStream outputStream, String host, String request) throws IOException {
            BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(outputStream, charset));
            String firstLine = String.format("GET %s HTTP/1.1", request);
            logger.debug("HTTP request: {}", firstLine);
            writer.write(firstLine);
            writer.newLine();
            String hostLine = String.format("Host: %s", host);
            logger.debug("HTTP request: {}", hostLine);
            writer.write(hostLine);
            writer.newLine();
            for (String key : headers.keySet()) {
                String entity = String.format("%s:%s", key, headers.get(key));
                logger.debug("HTTP request: {}", entity);
                writer.write(entity);
                writer.newLine();
            }
            writer.newLine();
            writer.flush();
        }
    
        private HTTPStatus getStatus(String firstLine) {
            Matcher matcher = Pattern.compile("HTTP/\\d.\\d (?<HTTPStatus>\\d{3}) \\w+").matcher(firstLine);
            if (matcher.find()) {
                switch (Integer.parseInt(matcher.group("HTTPStatus"))) {
                    case 200:
                        return HTTPStatus.OK;
                    case 404:
                        return HTTPStatus.NOT_FOUND;
                }
            }
            return null;
        }
    
        @Override
        public void run() {
            // TODO: replace with handler
            for (String website : websites) {
                if (!executorService.isShutdown()) {
                    try {
                        String result = crawl(website);
                        if (result != null) {
                            File file = new File(outputPath + "/" + website.replace("http://", "").replaceAll("[/.]", "_"));
                            logger.info("Writing data to {}", file.getAbsolutePath());
                            if (!file.exists()) file.createNewFile();
                            try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file)))) {
                                writer.write(result);
                                writer.flush();
                            }
                            logger.info("Has write {}", file.getAbsolutePath());
                        }
                        TimeUnit.SECONDS.sleep(new Random().nextInt(task.getTHREAD_SIZE() * 2));
                        synchronized (task) {
                            if (task.getCount() == task.getTASK_SIZE()) {
                                executorService.shutdown();
                            }
                        }
                    } catch (IOException e) {
                        logger.error(e.getMessage());
                        e.printStackTrace();
                    } catch (InterruptedException e) {
                        // e.printStackTrace();
                    }
                }
            }
        }
    
    }
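
    The Task class used above is not included in the post. Below is a minimal reconstruction that is only meant to be consistent with the calls SocketCrawler makes (addCount, getCount, getTHREAD_SIZE, getTASK_SIZE); the constructor and fields are assumptions:

    package per.piers.crawler.service;

    // Hypothetical reconstruction of Task; only the members that SocketCrawler
    // actually calls are included.
    public class Task {

        private final int THREAD_SIZE; // number of crawler threads
        private final int TASK_SIZE;   // total number of pages to crawl
        private int count;             // pages crawled so far; callers synchronize on the Task instance

        public Task(int threadSize, int taskSize) {
            this.THREAD_SIZE = threadSize;
            this.TASK_SIZE = taskSize;
        }

        public void addCount() { count++; }

        public int getCount() { return count; }

        public int getTHREAD_SIZE() { return THREAD_SIZE; }

        public int getTASK_SIZE() { return TASK_SIZE; }
    }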

    log4j2.xml

    <?xml version="1.0" encoding="UTF-8"?>
    <!--Configuration后面的status,这个用于设置log4j2自身内部的信息输出,可以不设置,当设置成trace时,你会看到log4j2内部各种详细输出。-->
    <!--monitorInterval:Log4j能够自动检测修改配置 文件和重新配置本身,设置间隔秒数。-->
    <configuration status="error" monitorInterval="30">
        <!--先定义所有的appender-->
        <appenders>
            <!--这个输出控制台的配置-->
            <Console name="Console" target="SYSTEM_OUT">
                <!--控制台只输出level及以上级别的信息(onMatch),其他的直接拒绝(onMismatch)-->
                <ThresholdFilter level="debug" onMatch="ACCEPT" onMismatch="DENY"/>
                <!--这个都知道是输出日志的格式-->
                <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} [%-5level] %class %t %M - %msg%n"/>
            </Console>
            <!--文件会打印出所有信息,这个log每次运行程序会自动清空,由append属性决定,这个也挺有用的,适合临时测试用-->
            <File name="log" fileName="log/test.log" append="false">
                <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} [%-5level] %class %t %M - %msg%n"/>
            </File>
            <!-- 这个会打印出所有的信息,每次大小超过size,则这size大小的日志会自动存入按年份-月份建立的文件夹下面并进行压缩,作为存档-->
            <RollingFile name="RollingFile" fileName="logs/app.log"
                         filePattern="log/%d{yyyy-MM}/app-%d{MM-dd-yyyy}-%i.log.gz">
                <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss} [%-5level] %class %t %M - %msg%n"/>
                <SizeBasedTriggeringPolicy size="50MB"/>
                <!-- DefaultRolloverStrategy属性如不设置,则默认为最多同一文件夹下7个文件,这里设置了20 -->
                <DefaultRolloverStrategy max="20"/>
            </RollingFile>
        </appenders>
        <!--然后定义logger,只有定义了logger并引入的appender,appender才会生效-->
        <loggers>
            <!--建立一个默认的root的logger-->
            <root level="trace">
                <appender-ref ref="RollingFile"/>
                <appender-ref ref="Console"/>
            </root>
        </loggers>
    </configuration>

    defaultHeaders.xml

    <?xml version="1.0" encoding="utf-8"?>
    <headers>
        <header key="User-Agent" value="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"></header>
    </headers>

    # Bugs encountered

    ## Problem

    The returned pages were garbled, and setting UTF-8 did not help.

    ## Solution

    At first the Accept-Encoding header was set in the request headers:

    <header key="Accept-Encoding" value="gzip, deflate, sdch, br"></header>

    which made the server return a compressed (encoded) page. Removing the header fixes it.
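
    Alternatively, the header could be kept and the gzip body decompressed explicitly; that would mean reading the socket's raw bytes instead of wrapping them in an InputStreamReader. A rough sketch (not part of the repository; GzipBodyDecoder is a made-up name):

    package per.piers.crawler.service;

    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.util.zip.GZIPInputStream;

    // Hypothetical helper: inflates a gzip-compressed response body into text.
    public class GzipBodyDecoder {

        public static String decode(InputStream compressedBody, String charset) throws IOException {
            try (GZIPInputStream gzip = new GZIPInputStream(compressedBody);
                 ByteArrayOutputStream buffer = new ByteArrayOutputStream()) {
                byte[] chunk = new byte[8192];
                int read;
                while ((read = gzip.read(chunk)) != -1) {
                    buffer.write(chunk, 0, read);
                }
                return buffer.toString(charset);
            }
        }
    }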

    ## Problem

    When making the HTTP request, the server did not return any data.

    ## Solution

    At the end of the HTTP request written to outputStream, send one extra blank line (the trailing writer.newLine() in setOutputStream) to indicate that the request has been fully sent.
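
    A minimal self-contained illustration of that fix (the class and method names are made up for the example; it mirrors the tail of setOutputStream):

    package per.piers.crawler.service;

    import java.io.BufferedWriter;
    import java.io.IOException;
    import java.io.OutputStream;
    import java.io.OutputStreamWriter;

    public class RequestTerminator {

        // The extra newLine() before flush() is the blank line that tells the
        // server the request headers are finished.
        static void writeMinimalRequest(OutputStream out, String host) throws IOException {
            BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
            writer.write("GET / HTTP/1.1");
            writer.newLine();
            writer.write("Host: " + host);
            writer.newLine();
            writer.newLine(); // terminating blank line; without it the server keeps waiting
            writer.flush();
        }
    }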

  • Original article: https://www.cnblogs.com/Piers/p/6716572.html