一只简单的网络爬虫（基于linux C/C++）————配置文件设计及读取

一般来说linux下比较大型的程序都是以配置文件作为参数介质传递的，该爬虫也采用配置文件的方式来获取参数，配置文件格式大致如下：

max_job_num=1
#seeds=https://www.baidu.com
#seeds=http://bbs.scut.edu.cn/classic/
logfile=spiderq.log

以“=”作为分隔符，左边为key，右边为value，主要包含的参数内容为：最大的任务数、种子、日志文件、日志等级、深度、模块路径、支持的文件类型等；以“#”开头的行作为注释。
配置文件结构体（Config）的初始化函数如下：

/*
 * Allocate a Config and fill it with the default settings.
 *
 * Defaults written here: max_job_num = 10, max_depth = INF, every string
 * option NULL, every other numeric option 0.
 *
 * Returns the newly allocated Config, or NULL if the allocation fails.
 * The caller owns the returned object.
 */
Config * initconfig()
{
    Config *conf = (Config *)malloc(sizeof(Config));

    /* Fix: the original wrote through conf without checking malloc's result. */
    if (conf == NULL)
        return NULL;

    conf->max_job_num = 10;
    conf->seeds = NULL;
    conf->include_prefixes = NULL;
    conf->exclude_prefixes = NULL;
    conf->logfile = NULL;
    conf->log_level = 0;
    conf->max_depth = INF;
    conf->make_hostdir = 0;
    conf->module_path = NULL;
    conf->stat_interval = 0;
    // NOTE(review): conf->modules and conf->accept_types are not initialised
    // here. loadconfig() calls push_back() on them, so they appear to be C++
    // containers; malloc() does not run their constructors - confirm how
    // they are meant to be initialised.

    return conf;
}

Config是一个全局的结构体，若以类的方式实现则可以设计成单例类，里面的成员主要保存从配置文件中读取到的参数。加载配置文件后，配置文件中的参数设置被保存到Config结构体中，供后面的函数使用。
加载配置文件的函数如下：

/*
 * Read CONF_FILE line by line and store each recognised "key=value"
 * directive into *conf.
 *
 * Lines that are empty after trimming, or that start with '#', are skipped.
 * Keys are compared case-insensitively. String values are duplicated with
 * strdup(), so the copies stored in conf must eventually be free()d.
 *
 * On an unknown key or a line that is not "key=value", an error naming the
 * line number is logged and parsing stops. The file handle and the
 * per-line vector returned by strsplit() are released on every path.
 */
void loadconfig(Config *conf)
{

    FILE *fp = NULL;
    char buf[MAX_CONF_LEN+1];
    int argc = 0;
    char **argv = NULL;
    int linenum = 0;
    char *line = NULL;
    const char *err = NULL;

    if ((fp = fopen(CONF_FILE, "r")) == NULL) /* open the configuration file */
    {
        SPIDER_LOG(SPIDER_LEVEL_ERROR, "Can't load conf_file %s", CONF_FILE);
        /* Fix: the original fell through and passed a NULL stream to fgets().
         * NOTE(review): if SPIDER_LOG terminates the process at ERROR level
         * this return is never reached, but it is harmless - confirm. */
        return;
    }

    while (fgets(buf, sizeof(buf), fp) != NULL) /* read one line into buf */
    {
        linenum++;
        line = strim(buf); /* strip surrounding whitespace (helper from qstring) */
        /* skip comment lines and blank lines */
        if (line[0] == '#' || line[0] == '\0') continue;
        /* Split at the first '=' only (limit 1); argc receives the number
         * of pieces. The pieces point into buf; only the vector is heap
         * allocated. */
        argv = strsplit(line, '=', &argc, 1);
        if (argc == 2) {
            if (strcasecmp(argv[0], "max_job_num") == 0) {
                conf->max_job_num = atoi(argv[1]);
            }
            /* strdup() allocates the copy with malloc(); whoever owns conf
             * must free() these strings when they are no longer needed. */
            else if (strcasecmp(argv[0], "logfile") == 0) {
                conf->logfile = strdup(argv[1]);
            } else if (strcasecmp(argv[0], "include_prefixes") == 0) {
                conf->include_prefixes = strdup(argv[1]);
            } else if (strcasecmp(argv[0], "exclude_prefixes") == 0) {
                conf->exclude_prefixes = strdup(argv[1]);
            } else if (strcasecmp(argv[0], "seeds") == 0) {
                conf->seeds = strdup(argv[1]);
            } else if (strcasecmp(argv[0], "module_path") == 0) {
                conf->module_path = strdup(argv[1]);
            } else if (strcasecmp(argv[0], "load_module") == 0) {
                conf->modules.push_back(strdup(argv[1]));
            } else if (strcasecmp(argv[0], "log_level") == 0) {
                conf->log_level = atoi(argv[1]);
            } else if (strcasecmp(argv[0], "max_depth") == 0) {
                conf->max_depth = atoi(argv[1]);
            } else if (strcasecmp(argv[0], "stat_interval") == 0) {
                conf->stat_interval = atoi(argv[1]);
            } else if (strcasecmp(argv[0], "make_hostdir") == 0) {
                conf->make_hostdir = yesnotoi(argv[1]);
            } else if (strcasecmp(argv[0], "accept_types") == 0) {
                conf->accept_types.push_back(strdup(argv[1]));
            } else {
                err = "Unknown directive"; goto conferr;
            }
        } else {
            err = "directive must be 'key=value'"; goto conferr;
        }

        /* Fix: the vector from strsplit() was leaked on every line. */
        free(argv);
        argv = NULL;
    }

    fclose(fp); /* Fix: the stream was never closed. */
    return;

conferr:
    /* Release resources before logging, in case the log call does not return. */
    free(argv);
    fclose(fp);
    SPIDER_LOG(SPIDER_LEVEL_ERROR, "Bad directive in %s[line:%d] %s", CONF_FILE, linenum, err); 
}

下面介绍几个函数：

（1）fgets函数是用来获取行的，该函数可读取文件中的一行，并且会包含换行符
（2）strim函数用于去除空格，其实现如下：参数是行指针

/*
 * Trim leading and trailing whitespace from str, in place.
 *
 * A '\0' is written just after the last non-whitespace character and a
 * pointer to the first non-whitespace character is returned. The returned
 * pointer refers into the caller's buffer, so nothing is allocated. For an
 * empty or all-whitespace string the result is an empty string.
 *
 * Returns NULL if str is NULL.
 */
char * strim(char *str)
{
    char *sp, *ep;

    if (str == NULL)
        return NULL;

    /* Skip leading whitespace. isspace('\0') is false, so this stops at the
     * terminator. The unsigned char cast avoids undefined behaviour for
     * negative char values. */
    sp = str;
    while (isspace((unsigned char)*sp)) sp++;

    /* Walk back from the terminator over trailing whitespace. Starting one
     * past the end avoids forming str-1 for an empty string, as the
     * original did. */
    ep = sp + strlen(sp);
    while (ep > sp && isspace((unsigned char)ep[-1])) ep--;

    *ep = '\0';
    return sp;
}

（3）strsplit函数用于分割字符串，实现方法如下：

/*
 * Split line at each occurrence of delimeter, in place.
 *
 * line      - buffer to split; every delimiter that is consumed is
 *             overwritten with '\0'.
 * delimeter - the separator character.
 * count     - out: number of pieces stored in the returned vector.
 * limit     - maximum number of splits to perform; the remainder of the
 *             line becomes the final piece. A value of 0 never stops the
 *             loop early.
 *
 * Each piece is trimmed with strim() and points into line. Only the vector
 * itself is heap allocated; the caller must free() it (not the pieces).
 * A trailing empty piece is not stored.
 *
 * Returns the vector, or NULL with *count == 0 when there are no pieces or
 * when memory allocation fails.
 */
char ** strsplit(char *line, char delimeter, int *count, int limit)
{
    char *ptr = NULL, *str = line;
    char **vector = NULL;
    char **grown = NULL;

    *count = 0;
    /* strchr() returns the first occurrence of delimeter in str, or NULL. */
    while ((ptr = strchr(str, delimeter)) != NULL)
    {
        *ptr = '\0';
        /* Fix: keep the old vector until realloc() succeeds, so it can be
         * freed on failure. */
        grown = (char **)realloc(vector, ((*count)+1)*sizeof(char *));
        if (grown == NULL) goto oom;
        vector = grown;
        vector[*count] = strim(str); /* piece before the delimiter, trimmed */
        str = ptr+1;                 /* continue after the delimiter */
        (*count)++;
        if (--limit == 0) break;
    }
    if (*str != '\0')
    {
        grown = (char **)realloc(vector, ((*count)+1)*sizeof(char *));
        if (grown == NULL) goto oom;
        vector = grown;
        vector[*count] = strim(str); /* remaining text, trimmed */
        (*count)++;
    }
    return vector;

oom:
    free(vector);
    *count = 0;
    return NULL;
}

（4）strcasecmp函数是忽略大小写比较字符串，该函数用来比较配置文件的key值
定义函数 int strcasecmp (const char *s1, const char *s2);
函数说明 strcasecmp()用来比较参数s1和s2字符串，比较时会自动忽略大小写的差异。
返回值若参数s1和s2字符串相等则返回0。s1大于s2则返回大于0 的值，s1 小于s2 则返回小于0的值。
（5）strdup函数
函数原型 char *strdup(const char *s);
功能: 将串拷贝到新建的位置处，strdup()在内部调用了malloc()为变量分配内存，不需要使用返回的字符串时，需要用free()释放相应的内存空间，否则会造成内存泄漏

相关阅读:
遗弃.Forsaken.2015.BluRay.720p.x264.DTS-beAst
三体
 Hexo博客maupassant主题添加Google Adsense广告
 拼凑自定义控件
 Redis 的安装和使用
 ES6中的数组
 机器学习 —— 数据预处理
 吴裕雄 Bootstrap 前端框架开发——Bootstrap 字体图标(Glyphicons)：glyphicon glyphicon-file
吴裕雄 Bootstrap 前端框架开发——Bootstrap 字体图标(Glyphicons)：glyphicon glyphicon-home
吴裕雄 Bootstrap 前端框架开发——Bootstrap 字体图标(Glyphicons)：glyphicon glyphicon-trash
原文地址：https://www.cnblogs.com/sigma0-/p/12630472.html