• Linux企业级项目实践之网络爬虫(21)——扩展为多任务爬虫


    高效的网络爬虫是搜索引擎的重要基础。采用多任务并发执行,实现类似于CPU流水线(pipeline)的运行方式,可以极大地提高网络资源和计算资源的利用率,从而提升爬虫的整体性能。


    #include "threads.h"

    #include "spider.h"
    #include "confparser.h"
     
    /* Number of threads currently running. */
    int g_cur_thread_num = 0;
    
    /* Mutex held while g_cur_thread_num is modified (see end_thread()). */
    pthread_mutex_t gctn_lock = PTHREAD_MUTEX_INITIALIZER;
    
    /*
     * Start a new thread running start_func(arg).
     *
     * start_func: thread entry point.
     * arg:        opaque argument handed to start_func.
     * pid:        if non-NULL, receives the id of the new thread; may be NULL
     *             when the caller does not need it.
     * pattr:      thread attributes to use. When NULL, a private attribute
     *             object is built (1 MiB stack, detached state) and destroyed
     *             again before returning. A caller-supplied attribute object
     *             is never destroyed here: it stays owned by the caller and
     *             may be reused for further threads.
     *
     * Returns 0 on success, otherwise the error number reported by the
     * failing pthread_* call.
     */
    int create_thread(void *(*start_func)(void *), void * arg, pthread_t *pid, pthread_attr_t * pattr)
    {
        enum { DEFAULT_STACK_SIZE = 1024 * 1024 };

        pthread_attr_t own_attr;
        pthread_t unused_id;
        int owns_attr = 0;
        int rv;

        if (pattr == NULL) {
            rv = pthread_attr_init(&own_attr);
            if (rv != 0)
                return rv;
            owns_attr = 1;
            pattr = &own_attr;

            rv = pthread_attr_setstacksize(pattr, DEFAULT_STACK_SIZE);
            if (rv == 0)
                rv = pthread_attr_setdetachstate(pattr, PTHREAD_CREATE_DETACHED);
            if (rv != 0) {
                /* Do not start a thread with half-configured attributes. */
                pthread_attr_destroy(pattr);
                return rv;
            }
        }

        if (pid == NULL)
            pid = &unused_id;

        rv = pthread_create(pid, pattr, start_func, arg);

        /* Only tear down the attribute object this function created itself. */
        if (owns_attr)
            pthread_attr_destroy(pattr);

        return rv;
    }
    
    /*
     * Log, at debug level, that the calling thread has started.
     */
    void begin_thread()
    {
        /* pthread_t is an opaque type; cast so the argument matches %lu. */
        SPIDER_LOG(SPIDER_LEVEL_DEBUG, "Begin Thread %lu", (unsigned long)pthread_self());
    }
    
    /*
     * Bookkeeping for a thread that is finishing.
     *
     * While holding gctn_lock: decrements g_cur_thread_num, works out how far
     * the count is below g_conf->max_job_num, calls attach_epoll_task() once
     * per free slot (at most twice), and logs the resulting thread count.
     */
    void end_thread()
    {
        pthread_mutex_lock(&gctn_lock);

        /* Free slots once the calling thread is no longer counted. */
        int left = g_conf->max_job_num - (--g_cur_thread_num);

        /*
         * One free slot -> one call, two or more -> two calls; when the limit
         * is already reached (left <= 0) the loop does not run at all.
         */
        int to_start = (left > 2) ? 2 : left;
        for (int i = 0; i < to_start; i++)
            attach_epoll_task();

        /* pthread_t is an opaque type; cast so the argument matches %lu. */
        SPIDER_LOG(SPIDER_LEVEL_DEBUG, "End Thread %lu, cur_thread_num=%d", (unsigned long)pthread_self(), g_cur_thread_num);
        pthread_mutex_unlock(&gctn_lock);
    }
    


  • 相关阅读:
    windows nginx
    stdClass 标准
    array_merge
    array_pop
    array_push
    array_unique
    GMT与UTC简介(转)
    curl-手册
    13.5. zipfile — Work with ZIP archives
    7. Input and Output
  • 原文地址:https://www.cnblogs.com/new0801/p/6176990.html
Copyright © 2020-2023  润新知