linux c++爬虫（一）

  1 int main(int argc, void *argv[]) 
  2 {
  3     struct epoll_event events[10];
  4     int daemonized = 0;
  5     char ch;
  6  
  7     
  8     while ((ch = getopt(argc, (char* const*)argv, "vhd")) != -1) {
  9         switch(ch) {
 10             case 'v':
 11                 version();
 12                 break;
 13             case 'd':
 14                 daemonized = 1;
 15                 break;
 16             case 'h':
 17             case '?':
 18             default:
 19                 usage();
 20         }
 21     }
 22  
 23     
 24     g_conf = initconfig();
 25     loadconfig(g_conf);
 26  
 27     
 28     set_nofile(1024); 
 29  
 30     
 31     vector<char *>::iterator it = g_conf->modules.begin();
 32     for(; it != g_conf->modules.end(); it++) {
 33         dso_load(g_conf->module_path, *it); 
 34     } 
 35  
 36     
 37     if (g_conf->seeds == NULL) {
 38         SPIDER_LOG(SPIDER_LEVEL_ERROR, "We have no seeds!");
 39     } else {
 40         int c = 0;
 41         char ** splits = strsplit(g_conf->seeds, ',', &c, 0);
 42         while (c--) {
 43             Surl * surl = (Surl *)malloc(sizeof(Surl));
 44             surl->url = url_normalized(strdup(splits[c]));
 45             surl->level = 0;
 46             surl->type = TYPE_HTML;
 47             if (surl->url != NULL)
 48                 push_surlqueue(surl);
 49         }
 50     }   
 51  
 52     
 53     if (daemonized)
 54         daemonize();
 55  
 56    
 57     chdir("download"); 
 58  
 59     
 60     int err = -1;
 61     if ((err = create_thread(urlparser, NULL, NULL, NULL)) < 0) {
 62         SPIDER_LOG(SPIDER_LEVEL_ERROR, "Create urlparser thread fail: %s", strerror(err));
 63     }
 64  
 65     /* waiting seed ourl ready */
 66     int try_num = 1;
 67     while(try_num < 8 && is_ourlqueue_empty())
 68         usleep((10000 << try_num++));
 69  
 70     if (try_num >= 8) {
 71         SPIDER_LOG(SPIDER_LEVEL_ERROR, "NO ourl! DNS parse error?");
 72     }
 73  
 74     /* set ticker  */
 75     if (g_conf->stat_interval > 0) {
 76         signal(SIGALRM, stat);
 77         set_ticker(g_conf->stat_interval);
 78     }
 79  
 80     /* begin create epoll to run */
 81     int ourl_num = 0;
 82     g_epfd = epoll_create(g_conf->max_job_num);
 83  
 84     while(ourl_num++ < g_conf->max_job_num) {
 85         if (attach_epoll_task() < 0)
 86             break;
 87     }
 88  
 89     /* epoll wait */
 90     int n, i;
 91     while(1) {
 92         n = epoll_wait(g_epfd, events, 10, 2000);
 93         printf("epoll:%d
",n);
 94         if (n == -1)
 95             printf("epoll errno:%s
",strerror(errno));
 96         fflush(stdout);
 97  
 98         if (n <= 0) {
 99             if (g_cur_thread_num <= 0 && is_ourlqueue_empty() && is_surlqueue_empty()) {
100                 sleep(1);
101                 if (g_cur_thread_num <= 0 && is_ourlqueue_empty() && is_surlqueue_empty())
102                     break;
103             }
104         }
105  
106         for (i = 0; i < n; i++) {
107             evso_arg * arg = (evso_arg *)(events[i].data.ptr);
108             if ((events[i].events & EPOLLERR) ||
109                 (events[i].events & EPOLLHUP) ||
110                 (!(events[i].events & EPOLLIN))) {
111                 SPIDER_LOG(SPIDER_LEVEL_WARN, "epoll fail, close socket %d",arg->fd);
112                 close(arg->fd);
113                 continue;
114             }
115  
116             epoll_ctl(g_epfd, EPOLL_CTL_DEL, arg->fd, &events[i]); /* del event */
117  
118             printf("hello epoll:event=%d
",events[i].events);
119             fflush(stdout);
120             create_thread(recv_response, arg, NULL, NULL);
121         }
122     }
123  
124     SPIDER_LOG(SPIDER_LEVEL_DEBUG, "Task done!");
125     close(g_epfd);
126     return 0;
127 }

本项目主要进行网页的抓取，上述为主控制模块

 while ((ch = getopt(argc, (char* const*)argv, "vhd")) != -1) {

这一行的主要作用是解析命令行参数：根据参数决定是否输出额外信息（-v 输出版本信息，-h 输出用法说明），以及以何种方式运行（-d 表示以守护进程方式运行）。

 24     g_conf = initconfig();
 25     loadconfig(g_conf);


进行初始化配置，并加载配置文件。
配置文件中包含抓取深度、种子、动态库路径、日志文件等信息。
下面是抓取前需要加载的配置文件示例：

cur_thread_num.
max_job_num=1
seeds=http://www.imeiding.com
logfile=spiderq.log

# Set the level to log. The probable values list as follow:
#   0 DEBUG
#   1 INFO
#   2 WARN
#   3 ERROR
#   4 CRIT
log_level=0

max_depth=0

module_path=/etc/spider/modules/

load_module=savehtml
load_module=saveimage
load_module=maxdepth
load_module=domainlimit
load_module=headerfilter


# specify which type of resource we accept. Each one a line.
# text/html is accepted default
accept_types=image/jpeg

我们将需要加载的动态库都存放在 vector 里面，以便后续使用。
但是在读取配置文件的时候，不要忘记对字符串的处理，比如空行、以 # 开头的注释行、多余的空格、按 = 划分键和值等问题。

接下来设置守护进程，以便使任务脱离终端控制。

创建线程，通过 libevent 进行 DNS 解析；然后开启 epoll 任务，向 epoll 中注册事件（模式为 ET 模式），不断等待内核中 epoll 事件的触发并进行处理。

通过开启线程进行 http 请求：手写 http 头部，向 server 端发送一个 http 请求报文。

http协议请求页面时的流程：

1、输入网址

2、向DNS发送解析请求

3、 DNS返回给我们一个对应的IP地址

4、通过IP地址向资源所在的主机发送请求

5、如果资源存在，主机返回200状态，同时返回数据部分

6、本地http客户端（一般来说是浏览器）接收数据

7、得到资源

得到 http 响应报文（接收报文）的时候，对报文进行解析，提取其中的 url 并放入队列中，同时对接收到的报文进行持久化操作。

相关阅读:
Web前端之jQuery 的10大操作技巧
 Python开发者须知 —— Bottle框架常见的几个坑
 string、const char*、 char* 、char[]相互转换
 SLAM中的变换（旋转与位移）表示方法
 SLAM
二叉搜索树(BST)
Linux下OSG的编译和安装以及遇到的问题
 CMake--Set用法
 CMake--List用法
 php面向对象面试题
原文地址：https://www.cnblogs.com/13224ACMer/p/6864447.html