A Simple Web Crawler Example (Repost)


    As a student you may have thought "web crawler" sounded very impressive, but a simple implementation is something even a student can follow without trouble.
    A crawler application treats the entire internet as a web, much like a spider's web, and the application is the bug crawling over it according to certain rules.
    Since http(s) is by far the most widely used protocol on the internet today, the example in this article is built on http(s). It is only a demonstration and does not cover the sophisticated algorithms (which, in practice, are the most important part).
    

    Design idea:
    The program starts from one or more entry urls, fetches each url's content over http(s), processes that content to extract the information we want, collects the url links found in the content, and then repeats the above steps for those links. A minimal sketch of this loop follows.
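    To make the design concrete before diving into the classes, here is a minimal single-threaded sketch of that loop. This sketch is my addition: fetch(), process() and extractLinks() are hypothetical placeholders for the HTTP download, content handling and link extraction that the real classes below implement.

    import java.util.ArrayDeque;
    import java.util.ArrayList;
    import java.util.Deque;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;

    public class CrawlLoopSketch {

        static void crawl(String entryUrl) {
            Deque<String> frontier = new ArrayDeque<String>();  // urls waiting to be crawled
            Set<String> seen = new HashSet<String>();           // every url ever enqueued
            frontier.add(entryUrl);
            seen.add(entryUrl);
            while (!frontier.isEmpty()) {
                String url = frontier.poll();
                String html = fetch(url);                 // download the page over http(s)
                process(html);                            // keep whatever information we want
                for (String link : extractLinks(html)) {
                    if (seen.add(link)) {                 // enqueue each url only once
                        frontier.add(link);
                    }
                }
            }
        }

        static String fetch(String url) { return ""; }           // placeholder
        static void process(String html) { }                     // placeholder
        static List<String> extractLinks(String html) {          // placeholder
            return new ArrayList<String>();
        }
    }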
    Without further ado, the details are in the full code and its comments:

    import java.util.concurrent.LinkedBlockingDeque;
    import java.util.concurrent.ThreadPoolExecutor;
    import java.util.concurrent.TimeUnit;

    /**
     * Summary: main program
     *
     * @author hwz
     */
    public class MainApp {

        private Integer corePoolSize = 10;

        private Integer maxPoolSize = 20;

        private ThreadPoolExecutor executor;

        /** work queue */
        private SpiderQueue workQueue;

        public void start(String url) throws Exception {
            // initialize the thread pool
            LinkedBlockingDeque<Runnable> executorQueue = new LinkedBlockingDeque<Runnable>(maxPoolSize);
            executor = new ThreadPoolExecutor(corePoolSize, maxPoolSize, 60L, TimeUnit.SECONDS, 
                    executorQueue);

            workQueue = new SpiderQueue(1024);
            SpiderUrl spiderUrl = new SpiderUrl(url, 0);
            try {
                workQueue.add(spiderUrl);
            }
            catch (Exception e) {
                System.out.println("insert url into workQueue error,url=" + url);
                e.printStackTrace();
            }

            // submit the first task
           executor.submit(new SimpleSpider(workQueue, "thread-" + "main"));
           int i = 0;
           int idle = 0;
           while (true) {
               // decide whether more worker threads are needed
               if (workQueue.size() > 20 && executor.getActiveCount() < maxPoolSize) {
                   idle = 0;
                   System.out.println("submit new thread,workQueue.size=" + workQueue.size() + 
                           ",executorQueue.activeCount=" + executor.getActiveCount() + ",i=" + i);
                   executor.submit(new SimpleSpider(workQueue, "thread-" + i++));
                   Thread.sleep(500);
               }
               else if (workQueue.size() == 0) {
                   idle++;
                   System.out.println("main method, idle times=" + idle);

                   // stop once the main thread has been idle 20 times in a row
                   if (idle > 20) {
                       System.out.println("main method, idle times=" + idle + ",end!");
                       break;
                   }
                   Thread.sleep(1000);
               }
               else {
                   Thread.sleep(2000);
               }
           }
           System.out.println("End!,workQueue.size=" + workQueue.size() + 
                           ",executorQueue.activeCount=" + executor.getActiveCount() + ",executorQueue.CompletedTaskCount=" +
                   executor.getCompletedTaskCount() + ",i=" + i);
           workQueue.printAll();
           executor.shutdown();
           System.exit(0);
        }

        public static void main(String[] args) throws Exception {

            MainApp app = new MainApp();
            app.start("http://www.csdn.net/");
        }
    }
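    One caveat worth knowing (not handled in the original code): ThreadPoolExecutor.submit throws RejectedExecutionException once executorQueue is full and all maxPoolSize threads are busy. The getActiveCount() check above narrows that window but does not close it. Below is a sketch, my addition rather than part of the original example, of one way to make submission safe using the standard CallerRunsPolicy rejection handler.

    import java.util.concurrent.LinkedBlockingDeque;
    import java.util.concurrent.ThreadPoolExecutor;
    import java.util.concurrent.TimeUnit;

    public class SafeExecutorSketch {

        // Same pool parameters as above, plus a rejection handler: when the pool
        // and its queue are saturated, the submitting thread runs the task itself
        // instead of triggering a RejectedExecutionException.
        static ThreadPoolExecutor newExecutor() {
            return new ThreadPoolExecutor(
                    10, 20, 60L, TimeUnit.SECONDS,
                    new LinkedBlockingDeque<Runnable>(20),
                    new ThreadPoolExecutor.CallerRunsPolicy());
        }
    }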


    import java.util.ArrayList;
    import java.util.List;

    /**
     * Summary: custom synchronized work queue for the crawler, backed by an ArrayList
     *
     * @author hwz
     */
    public class SpiderQueue {

        /** backing store */
        private List<SpiderUrl> queue;

        public SpiderQueue(int size) {
            queue = new ArrayList<SpiderUrl>(size);
        }

        public synchronized void add(SpiderUrl spiderUrl) {
            queue.add(spiderUrl);
        }

        public synchronized SpiderUrl poll() {
            if (queue.isEmpty()) {
                return null;
            }
            // print to the console for easy inspection
            SpiderUrl spiderUrl = queue.remove(0);
            System.out.println("SpiderQueue,poll,SpiderUrl=" + spiderUrl.toString() + ",remain size=" + queue.size());
            return spiderUrl;
        }

        public synchronized SpiderUrl peek() {
            if (queue.isEmpty()) {
                return null;
            }
            return queue.get(0);
        }

        public synchronized boolean isExist(SpiderUrl spiderUrl) {
            // note: this only sees urls still waiting in the queue; once a url has
            // been polled it can be enqueued again (see the variant sketched below)
            return queue.contains(spiderUrl);
        }

        public synchronized int size() {
            return queue.size();
        }

        public void printAll() {
            System.out.println("Enter printAll.");
            for (SpiderUrl spiderUrl : queue) {
                System.out.println(spiderUrl);
            }
        }
    }
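    As the comment in isExist notes, the queue forgets a url as soon as it is polled, so the same page can be crawled more than once. A sketch (my addition, not part of the original) of a variant that remembers every url ever enqueued:

    import java.util.ArrayList;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;

    // SpiderQueue variant that tracks every url ever added, so a url that has
    // already been polled and crawled is never enqueued a second time.
    public class DedupSpiderQueue {

        private final List<SpiderUrl> queue = new ArrayList<SpiderUrl>();
        private final Set<SpiderUrl> seen = new HashSet<SpiderUrl>();

        /** Adds the url only if it has never been seen; returns whether it was added. */
        public synchronized boolean addIfNew(SpiderUrl spiderUrl) {
            if (!seen.add(spiderUrl)) {   // relies on SpiderUrl.equals/hashCode (url only)
                return false;
            }
            return queue.add(spiderUrl);
        }

        public synchronized SpiderUrl poll() {
            return queue.isEmpty() ? null : queue.remove(0);
        }

        public synchronized int size() {
            return queue.size();
        }
    }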
    /**
     * Summary: a url for the crawler to work on
     *
     * @author hwz
     */
    public class SpiderUrl {

        /** http(s) url */
        private String url;

        /** depth of this url relative to the entry url */
        private int deep;

        public SpiderUrl(String url, int deep) {
            this.url = url;
            this.deep = deep;
        }

        public String getUrl() {
            return url;
        }

        public void setUrl(String url) {
            this.url = url;
        }

        public int getDeep() {
            return deep;
        }

        public void setDeep(int deep) {
            this.deep = deep;
        }

        @Override
        public boolean equals(Object obj) {
            if (!(obj instanceof SpiderUrl)) {
                return false;
            }
            SpiderUrl oth = (SpiderUrl) obj;
            return this.url.equals(oth.getUrl());
        }

        @Override
        public int hashCode() {
            return url.hashCode();
        }

        @Override
        public String toString() {
            return getClass().toString() + "[url:" + url + ",deep:" + deep + "]";
        }
    }
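    Since equals and hashCode look only at url, two SpiderUrl objects for the same address compare equal even at different depths, which is exactly what the duplicate check in the queue needs. A quick illustration (my addition):

    public class SpiderUrlEqualityDemo {
        public static void main(String[] args) {
            SpiderUrl a = new SpiderUrl("http://www.csdn.net/", 0);
            SpiderUrl b = new SpiderUrl("http://www.csdn.net/", 2);
            System.out.println(a.equals(b));                  // true: depth is ignored
            System.out.println(a.hashCode() == b.hashCode()); // true: consistent with equals
        }
    }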
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.net.URL;
    import java.net.URLConnection;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    /**
     * Summary: crawler worker, the main implementation class
     *
     * @author hwz
     */
    public class SimpleSpider implements Runnable {

        private String threadName;

        private SpiderUrl url;

        private SpiderQueue workQueue;

        public SimpleSpider(SpiderQueue workQueue, String threadName) {
            this.workQueue = workQueue;
            this.threadName = threadName;
        }

        @Override
        public void run() {
            System.out.println(threadName + " start run");
            // stop after 10 consecutive idle loops
            int idle = 0;
            while (idle < 10) {
                url = workQueue.poll();
                if (url != null) {
                    // parse the url
                    parseUrl(url);
                    idle = 0;
                }
                else {
                    System.out.println(threadName + " idle...,times=" + idle++);
                    try {
                        Thread.sleep(1000);
                    }
                    catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                }
            }
            System.out.println(threadName + " end run...");
        }

        /**
         * Parse a url
         * @param url
         */
        private void parseUrl(SpiderUrl url) {
            if (url == null) {
                return;
            }
            try {
                int deep = url.getDeep() + 1;
                URL netUrl = new URL(url.getUrl());
                URLConnection connection = netUrl.openConnection();
                String contentType = connection.getContentType();
                // fetch the content
                String resource = getResource(connection);
                // extract the title
                String title = getTitle(resource);
                // extract the links
                List<String> urls = getUrls(resource);
                System.out.println(threadName + ",parseUrl url=" + url + ",contentType=" + contentType + ",title=" + title + ",urls=" + urls);
                // limit the crawl depth: if every discovered url were enqueued, the
                // queue would grow exponentially and eventually crash the program
                if (deep < 3) {
                    SpiderUrl newUrl;
                    for (String u : urls) {
                        newUrl = new SpiderUrl(u, deep);
                        if (!workQueue.isExist(newUrl)) {
                            workQueue.add(newUrl);
                        }
                    }
                }
            }
            catch (IOException e) {
                e.printStackTrace();
            }
        }

        /**
         * Read the content behind an http url
         * @param connection
         * @return String
         */
        private String getResource(URLConnection connection) {
            if (connection == null) {
                return null;
            }
            StringBuilder sb = new StringBuilder();
            try (InputStreamReader isr = new InputStreamReader(connection.getInputStream(), "UTF-8")) {
                int input;
                while ((input = isr.read()) != -1) {
                    sb.append((char) input);
                }
            }
            catch (IOException e) {
                System.out.println(threadName + ",get resource error,connection=" + connection);
            }
            return sb.toString();
        }

        /**
         * Extract the title from the page content
         * @param content
         * @return String
         */
        private String getTitle(String content) {
            if (content == null) {
                return null;
            }
            Pattern pattern = Pattern.compile("(<title>.+?</title>)");
            Matcher matcher = pattern.matcher(content);
            String title = null;
            if (matcher.find()) {
                title = matcher.group(0).replaceAll("<title>", "").replaceAll("</title>", "");
            }
            return title;
        }

        /**
         * Extract the url links contained in the page content
         * @param content
         * @return List<String>
         */
        private List<String> getUrls(String content) {
            if (content == null) {
                return null;
            }
            Pattern pattern = Pattern.compile("(<a.+?href=['\"]?[a-zA-Z]+://[^\\s]*?[\\s>])");
            Matcher matcher = pattern.matcher(content);
            String a;
            String lastChar;
            List<String> links = new ArrayList<String>();
            while (matcher.find()) {
                a = matcher.group(0).replaceAll("<a.+?href=['\"]?", "");
                a = a.trim();
                lastChar = a.substring(a.length() - 1);
                if (lastChar.equals("'") || lastChar.equals("\"") || lastChar.equals(">")) {
                    a = a.substring(0, a.length() - 1);
                }
                links.add(a);
            }
            return links;
        }
    }
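    Parsing HTML with regular expressions, as getTitle and getUrls do, is fragile: it misses relative links and trips over unusual markup. For anything beyond a demo, an HTML parser such as jsoup is the usual choice. A brief sketch, assuming the org.jsoup:jsoup dependency is on the classpath (this is my addition, not part of the original example):

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;

    // getTitle/getUrls rebuilt on top of jsoup instead of regular expressions.
    public class JsoupSpiderSketch {

        static String title(String url) throws IOException {
            return Jsoup.connect(url).get().title();
        }

        static List<String> links(String url) throws IOException {
            Document doc = Jsoup.connect(url).get();
            List<String> links = new ArrayList<String>();
            for (Element a : doc.select("a[href]")) {
                links.add(a.attr("abs:href"));  // resolves relative links to absolute urls
            }
            return links;
        }
    }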

    This code example is only meant to illustrate a simple crawler; the multithreading and HTTP handling were not given much consideration. If you find any mistakes, please point them out.

Original article: https://www.cnblogs.com/zqyanywn/p/5979971.html