• Linux企业级项目实践之网络爬虫(4)——主程序流程


    当我们设计好程序框架之后,就要开始实现它了。第一步当然是实现主程序的流程框架,之后再逐步填充每个流程的细节及其需要调用的模块。

    主程序的流程如下:

    1、  解析命令行参数,并根据参数跳转到相应的处理分支

    2、  解析配置文件

    3、  载入处理模块

    4、  加载种子URL

    5、  启动抓取任务(创建URL解析线程,并进入epoll事件循环分发抓取任务)

    主程序的代码如下:

    int main(int argc, void *argv[])
    {
       struct epoll_event events[10];
       int daemonized = 0;
       char ch;
     
       while ((ch = getopt(argc, (char* const*)argv, "vhd")) != -1) {
            switch(ch) {
                case 'v':
                    version();
                    break;
                case 'd':
                    daemonized = 1;
                    break;
                case 'h':
                case '?':
                default:
                    usage();
            }
       }
     
       g_conf = initconfig();
       loadconfig(g_conf);
     
       set_nofile(1024);
     
       vector<char *>::iterator it = g_conf->modules.begin();
       for(; it != g_conf->modules.end(); it++) {
            dso_load(g_conf->module_path, *it);
       }
     
       if (g_conf->seeds == NULL) {
            SPIDER_LOG(SPIDER_LEVEL_ERROR, "Wehave no seeds, Buddy!");
       } else {
            int c = 0;
            char ** splits =strsplit(g_conf->seeds, ',', &c, 0);
            while (c--) {
                Surl * surl = (Surl*)malloc(sizeof(Surl));
                surl->url =url_normalized(strdup(splits[c]));
                surl->level = 0;
                surl->type = TYPE_HTML;
                if (surl->url != NULL)
                    push_surlqueue(surl);
           }
       }       
     
       if (daemonized)
            daemonize();
     
       chdir("download");
     
       int err = -1;
       if ((err = create_thread(urlparser, NULL, NULL, NULL)) < 0) {
            SPIDER_LOG(SPIDER_LEVEL_ERROR,"Create urlparser thread fail: %s", strerror(err));
       }
     
       int try_num = 1;
       while(try_num < 8 && is_ourlqueue_empty())
            usleep((10000 << try_num++));
     
       if (try_num >= 8) {
            SPIDER_LOG(SPIDER_LEVEL_ERROR, "NOourl! DNS parse error?");
       }
     
       if (g_conf->stat_interval > 0) {
            signal(SIGALRM, stat);
            set_ticker(g_conf->stat_interval);
       }
     
       int ourl_num = 0;
       g_epfd = epoll_create(g_conf->max_job_num);
     
       while(ourl_num++ < g_conf->max_job_num) {
            if (attach_epoll_task() < 0)
                break;
       }
     
       int n, i;
       while(1) {
            n = epoll_wait(g_epfd, events, 10,2000);
            printf("epoll:%d
    ",n);
            if (n == -1)
                printf("epollerrno:%s
    ",strerror(errno));
            fflush(stdout);
     
            if (n <= 0) {
                if (g_cur_thread_num <= 0&& is_ourlqueue_empty() && is_surlqueue_empty()) {
                    sleep(1);
                    if (g_cur_thread_num <= 0&& is_ourlqueue_empty() && is_surlqueue_empty())
                        break;
                }
            }
     
            for (i = 0; i < n; i++) {
                evso_arg * arg = (evso_arg*)(events[i].data.ptr);
                if ((events[i].events &EPOLLERR) ||
                    (events[i].events &EPOLLHUP) ||
                    (!(events[i].events &EPOLLIN))) {
                    SPIDER_LOG(SPIDER_LEVEL_WARN,"epoll fail, close socket %d",arg->fd);
                    close(arg->fd);
                    continue;
                }
                epoll_ctl(g_epfd, EPOLL_CTL_DEL,arg->fd, &events[i]); /* del event */
     
                printf("helloepoll:event=%d
    ",events[i].events);
                fflush(stdout);
                create_thread(recv_response, arg,NULL, NULL);
            }
       }
     
       SPIDER_LOG(SPIDER_LEVEL_DEBUG, "Task done!");
       close(g_epfd);
       return 0;
    }


  • 相关阅读:
    compilation debug= true targetframework= 4.0 / configuration error
    Using Temp table in SSIS package
    Using an Excel Destination in SSIS with x64
    SQL Server 中的两个查询级别的Hint NOLOCK和ROWLOCK
    SQL Server中的timeout设置
    Global.asax 转
    VC++动态链接库编程之MFC规则DLL
    堆栈详解(数据与内存中的存储方式) .
    [C++]拷贝构造函数和赋值运算符重载
    #ifdef __cplusplus extern "C" { #endif”的定义的含义 .
  • 原文地址:https://www.cnblogs.com/new0801/p/6177008.html
Copyright © 2020-2023  润新知