• libsvm数据缩放方法


    assumption: min index of attributes is 1 
    pass 1: find out max index of attributes :

    1.1也就是扫描所有样本,找出出现过的最大特征下标(即特征的维数)。libsvm的特征格式中每个特征值前面都带有下标(index:value),缺失的下标被认为取值为0,这种稀疏表示不必保存大量的0值,可以节省存储并提高计算速度。其实我获取的数据即便是0值也进行了保存,如果在保存特征时进行0值判断的话,会变得有点麻烦,所以先做了简单化处理。这是以后可以修改的一个地方。

    1.2创建保存最值的数组,并初始化。

    // Restore mode: pre-scan the saved scaling-parameter file so that max_index
    // also covers every feature index recorded there, not only those that occur
    // in the data file.
    if(restore_filename)
    	{
    		int idx, c;
    		fp_restore = fopen(restore_filename,"r");
    		// Fix: fopen() returns NULL when the file cannot be opened; the
    		// original passed that NULL straight to fgetc().
    		if(fp_restore == NULL)
    		{
    			fprintf(stderr,"can't open file %s\n", restore_filename);
    			exit(1);
    		}
    		c = fgetc(fp_restore);
    		// A leading 'y' triggers three extra readline() calls before the
    		// common two below.
    		// NOTE(review): presumably these skip the target-scaling header
    		// lines of the file -- confirm against the code that writes it.
    		if(c == 'y')
    		{
    			readline(fp_restore);
    			readline(fp_restore);
    			readline(fp_restore);
    		}
    		// Two more lines are consumed before the per-feature records start.
    		readline(fp_restore);
    		readline(fp_restore);
    		// Each record is "<index> <float> <float>"; only the index is kept.
    		while(fscanf(fp_restore,"%d %*f %*f\n",&idx) == 1)
    			max_index = max(idx,max_index);
    		// Leave the stream at the start for whoever reads the file next.
    		rewind(fp_restore);
    	}
    	// Pass 1 over the data file: walk every "index:value" element of every
    	// line, tracking the largest index seen (max_index) and counting the
    	// elements (num_nonzeros).
    	while(readline(fp)!=NULL)
    	{
    		// NOTE(review): `line` is presumably the buffer filled by readline();
    		// its definition is outside this excerpt.
    		char *p=line;
    
    		// Macro defined elsewhere; presumably advances p past the target
    		// value at the start of the line -- confirm.
    		SKIP_TARGET
    
    		// "%d:%*f" reads the index and parses but discards the value.
    		while(sscanf(p,"%d:%*f",&index)==1)
    		{
    			max_index = max(max_index, index);
    			// Macro defined elsewhere; presumably advances p to the next
    			// element -- confirm.
    			SKIP_ELEMENT
    			num_nonzeros++;
    		}		
    	}
    	// Reset the stream so the next pass can read the file from the start.
    	rewind(fp);
    //创建保存最值的数组
    	feature_max = (double *)malloc((max_index+1)* sizeof(double));
    	feature_min = (double *)malloc((max_index+1)* sizeof(double));
    	if(feature_max == NULL || feature_min == NULL)
    	{
    		fprintf(stderr,"can't allocate enough memory\n");
    		exit(1);
    	}
    //初始化
    	for(i=0;i<=max_index;i++)
    	{
    		feature_max[i]=-DBL_MAX;
    		feature_min[i]=DBL_MAX;
    	}

    pass 2: find out min/max value,逐行扫描数据,找出每个特征(每一列)的最大值与最小值,并保存到相应数组中;同时记录目标值的最大值y_max与最小值y_min。

    // Pass 2 over the data file: fold every line into the target range
    // (y_min / y_max) and the per-feature ranges (feature_min / feature_max).
    // A feature index that does not appear on a line contributes the value 0.
    while(readline(fp)!=NULL)
    	{
    		char *p=line;
    		// Lowest feature index not yet accounted for on this line.
    		int next_index=1;
    		double target;
    		double value;
    
    		// NOTE(review): the sscanf result is not checked; if the line does
    		// not start with a number, `target` is read uninitialized below.
    		sscanf(p,"%lf",&target);
    		y_max = max(y_max,target);
    		y_min = min(y_min,target);
    		
    		// Macro defined elsewhere; presumably advances p past the target
    		// value -- confirm.
    		SKIP_TARGET
    
    		while(sscanf(p,"%d:%lf",&index,&value)==2)
    		{
    			// Indices in [next_index, index) are absent from this line:
    			// fold 0 into their ranges.
    			for(i=next_index;i<index;i++)
    			{
    				feature_max[i]=max(feature_max[i],0);
    				feature_min[i]=min(feature_min[i],0);
    			}
    			
    			// Fold the value that is actually present.
    			feature_max[index]=max(feature_max[index],value);
    			feature_min[index]=min(feature_min[index],value);
    
    			// Macro defined elsewhere; presumably advances p to the next
    			// element -- confirm.
    			SKIP_ELEMENT
    			next_index=index+1;
    		}		
    
    		// Indices after the last element on the line, up to max_index, are
    		// absent as well: fold 0 into their ranges.
    		for(i=next_index;i<=max_index;i++)
    		{
    			feature_max[i]=max(feature_max[i],0);
    			feature_min[i]=min(feature_min[i],0);
    		}	
    	}
    
    	// Reset the stream so the next pass can read the file from the start.
    	rewind(fp);
    

      pass 3: scale 缩放

    // Pass 3 over the data file: emit one output line per input line. The
    // target goes through output_target(); every feature index from 1 to
    // max_index goes through output(), with 0 passed for indices that are
    // absent from the line.
    while(readline(fp)!=NULL)
    	{
    		char *p=line;
    		// Lowest feature index not yet emitted for this line.
    		int next_index=1;
    		double target;
    		double value;
    		// NOTE(review): the sscanf result is not checked; if the line does
    		// not start with a number, `target` is passed on uninitialized.
    		sscanf(p,"%lf",&target);
    		output_target(target);
    		// Macro defined elsewhere; presumably advances p past the target
    		// value -- confirm.
    		SKIP_TARGET
    		while(sscanf(p,"%d:%lf",&index,&value)==2)
    		{
    			// Indices skipped on this line are emitted with value 0.
    			for(i=next_index;i<index;i++)
    				output(i,0);
    			
    			// output() is defined elsewhere; presumably it prints the
    			// scaled feature -- confirm.
    			output(index,value);
    
    			// Macro defined elsewhere; presumably advances p to the next
    			// element -- confirm.
    			SKIP_ELEMENT
    			next_index=index+1;
    		}		
    		// Trailing indices up to max_index are emitted with value 0 too.
    		for(i=next_index;i<=max_index;i++)
    			output(i,0);
    
    		printf("\n");
    	}
    

      

    /*
     * Print one target value followed by a space.
     *
     * When y_scaling is set, the value is first mapped linearly from
     * [y_min, y_max] onto [y_lower, y_upper]. The two end points are assigned
     * directly rather than computed through the formula. When y_scaling is
     * not set, the value is printed unchanged.
     */
    void output_target(double value)
    {
    	double result = value;

    	if(y_scaling)
    	{
    		if(value == y_min)
    		{
    			result = y_lower;
    		}
    		else if(value == y_max)
    		{
    			result = y_upper;
    		}
    		else
    		{
    			/* Same expression and evaluation order as the original. */
    			result = y_lower + (y_upper-y_lower) *
    				     (value - y_min)/(y_max-y_min);
    		}
    	}

    	printf("%g ",result);
    }
    

      

    效果:把各个特征缩放到同一数值范围内,避免取值范围大的特征主导取值范围小的特征,从而减小奇异样本数据对处理过程的影响。

  • 相关阅读:
    react native配置ip真机测试
    APP Store上架QA&注意事项
    iOS 开发】解决使用 CocoaPods 执行 pod install 时出现
    iphoneX适配!!!
    better-scroll和swiper使用中的坑
    js知识巩固
    vue的学习(常用功能)
    vue学习生命周期(created和mounted区别)
    jq常用功能操作
    移动端中遇到的坑(bug)!!!
  • 原文地址:https://www.cnblogs.com/xiangshancuizhu/p/2241406.html
Copyright © 2020-2023  润新知