• L2R 二:常用评价指标之AUC


    零零散散写了一些,主要是占个坑:

      AUC作为一个常用的评价指标,无论是作为最后模型效果评价还是前期的特征选择,都发挥着不可替代的作用,下面我们详细介绍下这个指标。

      1.定义

      2.实现    

    # coding=utf-8
    # auc值的大小可以理解为: 随机抽一个正样本和一个负样本,正样本预测值比负样本大的概率(两者预测值相等时按0.5计)
    # 根据这个定义,我们可以自己实现计算auc
    
    from sklearn.metrics import roc_curve, auc, roc_auc_score
    import random
    import time
    import sys
    import codecs
    import numpy as np
    
    def timeit(func):
        """
        Decorator that prints how long each call to ``func`` takes.

        The wrapped function's return value is passed through unchanged. If
        ``func`` raises, the exception propagates and nothing is printed.

        :param func: callable to time.
        :return: wrapper that calls ``func`` and prints the elapsed wall-clock
                 time in seconds, labelled with ``func.__name__``.
        """
        # Local import so the decorator does not depend on a file-level import.
        import functools

        # functools.wraps keeps __name__/__doc__ of the decorated function on
        # the wrapper, so introspection and stacked decorators still see them.
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            time_start = time.time()
            result = func(*args, **kwargs)
            exec_time = time.time() - time_start
            print("{function} exec time: {time}s".format(function=func.__name__, time=exec_time))
            return result

        return wrapper
    
    
    def gen_label_pred(n_sample):
        """
        Randomly generate labels and predicted scores for ``n_sample`` samples.

        :param n_sample: number of samples to generate.
        :return: tuple ``(labels, preds)``; ``labels`` is a list of random 0/1
                 ints and ``preds`` a list of random floats from
                 ``random.random()``.
        """
        # All labels are drawn before any score, so the sequence of RNG calls
        # is the same for a given seed.
        labels = []
        for _ in range(n_sample):
            labels.append(random.randint(0, 1))

        preds = []
        for _ in range(n_sample):
            preds.append(random.random())

        return labels, preds
    
    
    def load_label_pred(label_file):
        """
        Load labels and predicted scores from a UTF-8, tab-separated text file.

        Each non-blank line holds the label in the first column and the
        predicted score in the second; any further columns are ignored.
        Blank lines are skipped.

        :param label_file: path of the file to read.
        :return: tuple ``(labels, preds)`` of two numpy arrays with one entry
                 per data line.
        :raises ValueError: if one of the two columns is not a valid float.
        :raises IndexError: if a non-blank line has fewer than two columns.
        """
        labels = []
        preds = []
        # Single pass: both columns are parsed from the same line, so the file
        # is opened and split only once.
        with codecs.open(label_file, "r", "utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate empty lines (e.g. a trailing newline at EOF).
                    continue
                fields = line.split("\t")
                labels.append(float(fields[0]))
                preds.append(float(fields[1]))

        return np.array(labels), np.array(preds)
    
    @timeit
    def sklearn_auc_api(labels, preds):
        """
        Compute AUC by delegating to scikit-learn's ``roc_auc_score``.

        :param labels: ground-truth labels, passed through as the first argument.
        :param preds: predicted scores, passed through as the second argument.
        :return: whatever ``roc_auc_score(labels, preds)`` returns.
        """
        return roc_auc_score(labels, preds)
    
    
    
    @timeit
    def naive_auc(labels, preds):
        """
        Compute AUC exactly by sorting the samples by predicted score.

        Counts the (positive, negative) pairs in which the positive sample has
        the higher score and divides by the total number of such pairs. A pair
        whose two scores are equal counts as half a pair, so the result does
        not depend on the input order of tied samples.

        Complexity: O(N log N) for N samples (dominated by the sort).

        :param labels: sequence of 0/1 labels; a label equal to 1 is positive,
                       anything else is counted as negative.
        :param preds: sequence of predicted scores aligned with ``labels``.
        :return: AUC as a float.
        :raises ZeroDivisionError: if there are no positive/negative pairs
                                   (e.g. only one class is present).
        """
        # Local import so this block does not depend on a file-level import.
        from itertools import groupby

        n_pos = sum(labels)
        n_neg = len(labels) - n_pos
        total_pair = n_pos * n_neg

        ranked = sorted(zip(labels, preds), key=lambda pair: pair[1])

        accumulated_neg = 0   # negatives with a strictly lower score so far
        satisfied_pair = 0.0
        # Walk the ranking one group of equal scores at a time so ties can be
        # handled symmetrically.
        for _, tied in groupby(ranked, key=lambda pair: pair[1]):
            tie_pos = 0
            tie_neg = 0
            for label, _score in tied:
                if label == 1:
                    tie_pos += 1
                else:
                    tie_neg += 1
            # Each positive beats every strictly lower negative, and gets half
            # credit for every negative sharing its score.
            satisfied_pair += tie_pos * accumulated_neg + 0.5 * tie_pos * tie_neg
            accumulated_neg += tie_neg

        return satisfied_pair / float(total_pair)
    
    
    
    @timeit
    def approximate_auc(labels, preds, n_bins=100):
        """
        Approximate AUC by bucketing the predicted scores into histograms.

        Scores are assumed to lie in [0, 1] and are split into ``n_bins``
        equal-width buckets. One histogram is built for positives and one for
        negatives, then the satisfied (positive, negative) pairs are counted
        bucket by bucket. Pairs that fall into the same bucket count as half a
        pair, which is where the approximation error comes from: the ordering
        inside a bucket is lost, so the choice of ``n_bins`` trades accuracy
        for memory.

        Complexity: O(N + n_bins) for N samples.

        :param labels: sequence of 0/1 labels; a label equal to 1 is positive,
                       anything else is counted as negative.
        :param preds: sequence of predicted scores aligned with ``labels``.
        :param n_bins: number of equal-width buckets over [0, 1].
        :return: approximate AUC as a float.
        :raises ZeroDivisionError: if there are no positive/negative pairs
                                   (e.g. only one class is present).
        """
        n_pos = sum(labels)
        n_neg = len(labels) - n_pos
        total_pair = n_pos * n_neg

        pos_histogram = [0] * n_bins
        neg_histogram = [0] * n_bins
        bin_width = 1.0 / n_bins
        last_bin = n_bins - 1
        for i, label in enumerate(labels):
            # Clamp the bucket index: a score of exactly 1.0 would otherwise
            # map to n_bins (IndexError), and a negative score would silently
            # wrap around to the last buckets.
            nth_bin = min(max(int(preds[i] / bin_width), 0), last_bin)
            if label == 1:
                pos_histogram[nth_bin] += 1
            else:
                neg_histogram[nth_bin] += 1

        accumulated_neg = 0   # negatives in strictly lower buckets
        satisfied_pair = 0
        for pos_count, neg_count in zip(pos_histogram, neg_histogram):
            # Full credit against lower buckets, half credit within the bucket.
            satisfied_pair += pos_count * accumulated_neg + pos_count * neg_count * 0.5
            accumulated_neg += neg_count

        return satisfied_pair / float(total_pair)
    
    
    if __name__ == "__main__":
        # Script entry point: load labels/scores from the file given as the
        # first command-line argument and compare the AUC implementations.

        # Exit with a usage hint instead of an IndexError traceback when the
        # input path is missing.
        if len(sys.argv) < 2:
            sys.exit("usage: {} <label_pred_file>".format(sys.argv[0]))

        # Alternative data source (synthetic data):
        #   labels, preds = gen_label_pred(10000000)
        labels, preds = load_label_pred(sys.argv[1])
        naive_auc_rst = naive_auc(labels, preds)
        # approximate_auc is not run here; 0 is only a placeholder so the
        # report line below keeps its format. To enable it:
        #   approximate_auc_rst = approximate_auc(labels, preds)
        approximate_auc_rst = 0
        sklearn_rst = sklearn_auc_api(labels, preds)
        print("naive auc result:{},approximate auc result:{},sklearn auc result:{}".format(naive_auc_rst, approximate_auc_rst, sklearn_rst))

        # Sample output recorded by the author (presumably from a run on
        # generated data with approximate_auc enabled):
        #   naive_auc exec time: 31.7306630611s
        #   approximate_auc exec time: 2.32403683662s
        #   naive auc result:0.500267265728,approximate auc result:0.50026516844
    

      

      3.应用

  • 相关阅读:
    HTML5实现音频播放
    百度编辑器UEditor常用设置函数大全
    .Net一般处理程序来实现用户名的验证
    软件设计师13-数据库设计
    软件设计师12-数据流图
    百度云BCC安装WordPress镜像
    Java获取客户端真实IP地址
    软件设计师11-面向对象技术
    百度云BCC主机宝镜像
    软件设计师10-系统开发模型
  • 原文地址:https://www.cnblogs.com/zidiancao/p/8779292.html
Copyright © 2020-2023  润新知