• 回归_英国酒精和香烟关系


    sklearn实战-乳腺癌细胞数据挖掘(博客主亲自录制视频教程)

    https://study.163.com/course/introduction.htm?courseId=1005269003&utm_campaign=commission&utm_source=cp-400000000398149&utm_medium=share

    数据统计分析联系:QQ:231469242

    英国酒精和香烟官网

    http://lib.stat.cmu.edu/DASL/Stories/AlcoholandTobacco.html

    Story Name: Alcohol and Tobacco. Image: Scatterplot of Alcohol vs. Tobacco, with Northern Ireland marked with a blue X.

    Story Topics: Consumer, Health. Datafile Name: Alcohol and Tobacco. Methods: Correlation, Dummy variable, Outlier, Regression, Scatterplot. Abstract: Data from a British government survey of household spending may be used to examine the relationship between household spending on tobacco products and alcoholic beverages. A scatterplot of spending on alcohol vs. spending on tobacco in the 11 regions of Great Britain shows an overall positive linear relationship with Northern Ireland as an outlier. Northern Ireland's influence is illustrated by the fact that the correlation between alcohol and tobacco spending jumps from .224 to .784 when Northern Ireland is eliminated from the dataset.

    This dataset may be used to illustrate the effect of a single influential observation on regression results. In a simple regression of alcohol spending on tobacco spending, tobacco spending does not appear to be a significant predictor of alcohol spending. However, including a dummy variable that takes the value 1 for Northern Ireland and 0 for all other regions results in significant coefficients for both tobacco spending and the dummy variable, and a high R-squared.

    两个模块算出的R平方值一样的

    # -*- coding: utf-8 -*-
    """
    python3.0
    Alcohol and Tobacco 酒精和烟草的关系
    http://lib.stat.cmu.edu/DASL/Stories/AlcoholandTobacco.html
    很多时候,数据读写不一定是文件,也可以在内存中读写。
    StringIO顾名思义就是在内存中读写str。
    要把str写入StringIO,我们需要先创建一个StringIO,然后,像文件一样写入即可
    """
    
    import numpy as np
    import pandas as pd
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import statsmodels.formula.api as sm
    from sklearn.linear_model import LinearRegression
    from scipy import stats
    
    
    # Spending on alcohol and tobacco, one value per region (11 paired
    # observations).
    list_alcohol = [6.47, 6.13, 6.19, 4.89, 5.63, 4.52, 5.89, 4.79, 5.27, 6.08, 4.02]
    list_tobacco = [4.03, 3.76, 3.77, 3.34, 3.47, 2.92, 3.20, 2.71, 3.53, 4.51, 4.56]

    # Scatter plot of all regions: tobacco on the x axis, alcohol on the y axis.
    plt.plot(list_tobacco, list_alcohol, 'ro')
    plt.ylabel('Alcohol')
    # Label the x axis; calling ylabel a second time would overwrite the
    # 'Alcohol' label and leave the x axis unlabelled.
    plt.xlabel('Tobacco')
    plt.title('Sales in Several UK Regions')
    plt.show()

    data = pd.DataFrame({'Alcohol': list_alcohol, 'Tobacco': list_tobacco})

    # Ordinary least squares of Alcohol on Tobacco. data[:-1] drops the last
    # row before fitting (presumably the Northern Ireland outlier described
    # in the story text -- TODO confirm the row order against the data file).
    result = sm.ols('Alcohol ~ Tobacco', data[:-1]).fit()
    print(result.summary())
    

    python2.7

    # -*- coding: utf-8 -*-
    #斯皮尔曼等级相关(Spearman’s correlation coefficient for ranked data)
    import numpy as np
    import scipy.stats as stats
    from scipy.stats import f
    import pandas as pd
    import matplotlib.pyplot as plt
    from statsmodels.stats.diagnostic import lillifors
    import normality_check
    
    
    # Paired observations: the same ten leading values as list_tobacco (x)
    # and list_alcohol (y) in the first script.
    x = [4.03, 3.76, 3.77, 3.34, 3.47, 2.92, 3.20, 2.71, 3.53, 4.51]
    y = [6.47, 6.13, 6.19, 4.89, 5.63, 4.52, 5.89, 4.79, 5.27, 6.08]

    # Number of observations and the groups handed to the normality check.
    sample = len(x)
    list_group = [x, y]


    # Data visualisation: scatter plot of y against x as red dots.
    plt.plot(x, y, 'ro')
    # Spearman rank correlation (non-parametric test)
    def Spearmanr(x,y):
        """Print the squared Spearman rank correlation of x and y and its p-value.

        Parameters:
            x, y: equally long sequences of paired observations.

        Returns:
            None. All results are written to standard output.

        If the two samples differ in length, a warning is printed and no
        statistic is computed.
        """
        print("use spearmanr,Nonparametric tests")
        # Take the sample size from the arguments so the function does not
        # depend on a module-level variable.
        n_obs = len(x)
        # Warn when the samples are not the same length; the statistic needs
        # paired data, so there is nothing to compute in that case.
        if n_obs != len(y):
            print("warning,the samples are not equal!")
            return
        r, p = stats.spearmanr(x, y)
        # Single-argument print with %-formatting gives the same output on
        # Python 2 and Python 3.
        print("spearman r**2: %s" % (r**2,))
        print("spearman p: %s" % (p,))
        if n_obs < 500 and p > 0.05:
            print("when sample < 500,p has no mean(>0.05)")
            print("when sample > 500,p has mean")
        
        
    # Pearson correlation (parametric test)
    def Pearsonr(x,y):
        """Print the squared Pearson correlation of x and y and its p-value.

        Parameters:
            x, y: equally long sequences of paired observations.

        Returns:
            None. All results are written to standard output.

        An extra caveat line is printed when there are fewer than 30
        observations.
        """
        print("use Pearson,parametric tests")
        r, p = stats.pearsonr(x, y)
        # Single-argument print with %-formatting gives the same output on
        # Python 2 and Python 3.
        print("pearson r**2: %s" % (r**2,))
        print("pearson p: %s" % (p,))
        # Use the size of the data actually passed in rather than a
        # module-level variable.
        if len(x) < 30:
            print("when sample <30,pearson has no mean")
    
    # Kendall's tau (non-parametric test)
    def Kendalltau(x,y):
        """Print the squared Kendall tau of x and y and its p-value.

        Parameters:
            x, y: equally long sequences of paired observations.

        Returns:
            None. All results are written to standard output.
        """
        print("use kendalltau,Nonparametric tests")
        r, p = stats.kendalltau(x, y)
        # Single-argument print with %-formatting gives the same output on
        # Python 2 and Python 3.
        print("kendalltau r**2: %s" % (r**2,))
        print("kendalltau p: %s" % (p,))
        
    
    # Model selection
    def mode(x,y):
        """Pick and run the correlation tests for x and y based on normality.

        Normality is checked with normality_check.NormalTest on the
        module-level ``list_group`` (NOTE(review): not on the x, y arguments
        themselves -- confirm this is intended). Then:

        * more than two groups in ``list_group``: Kendalltau is run;
        * data not normal: Spearmanr and Kendalltau are run;
        * data normal: Pearsonr is run.

        Kendalltau is run at most once per call.

        Returns:
            None. All results are written to standard output.
        """
        # Normality test over all groups.
        normal_result = normality_check.NormalTest(list_group)
        # Single-argument print with %-formatting gives the same output on
        # Python 2 and Python 3.
        print("normality result: %s" % (normal_result,))

        kendall_reported = len(list_group) > 2
        if kendall_reported:
            Kendalltau(x, y)

        if normal_result:
            Pearsonr(x, y)
        else:
            Spearmanr(x, y)
            # Skip the second Kendall run when it was already reported above.
            if not kendall_reported:
                Kendalltau(x, y)
            
    
    # Run the analysis on the module-level x and y samples.
    mode(x,y)
    '''
    x=[50,60,70,80,90,95]
    y=[500,510,530,580,560,1000]
    use shapiro:
    data are normal distributed
    use shapiro:
    data are not normal distributed
    normality result: False
    use spearmanr,Nonparametric tests
    spearman r: 0.942857142857
    spearman p: 0.00480466472303
    use kendalltau,Nonparametric tests
    kendalltau r: 0.866666666667
    kendalltau p: 0.0145950349193
    
    #肯德尔系数测试
    x=[3,5,2,4,1]
    y=[3,5,2,4,1]
    z=[3,4,1,5,2]
    h=[3,5,1,4,2]
    k=[3,5,2,4,1]
    '''
    

     python2.7

    # -*- coding: utf-8 -*-
    '''
    Author:Toby
    QQ:231469242,all rights reserved,no commercial use
    normality_check.py
    正态性检验脚本
     
    '''
     
    import scipy
    from scipy.stats import f
    import numpy as np
    import matplotlib.pyplot as plt
    import scipy.stats as stats
    # additional packages
    from statsmodels.stats.diagnostic import lillifors
     
    
     
     
    # Normality test for a single sample
    def check_normality(testData):
        """Test one sample for normality, choosing the test by sample size.

        Test selection:
            20 < n < 50    -> stats.normaltest
            n <= 20        -> stats.shapiro
            50 <= n <= 300 -> lillifors (imported at the top of this file)
            n > 300        -> stats.kstest against a normal distribution
                              with the sample's own mean and standard
                              deviation

        Parameters:
            testData: the sample to test (presumably a one-dimensional
                sequence of numbers -- TODO confirm against callers).

        Returns:
            False when the p-value is below 0.05, otherwise True. The name
            of the test used and the verdict are also printed.
        """
        n_obs = len(testData)

        if 20 < n_obs < 50:
            method_line = "use normaltest"
            p_value = stats.normaltest(testData)[1]
        elif n_obs < 50:
            # Reached only for n <= 20; larger samples below 50 are handled
            # by the branch above.
            method_line = "use shapiro:"
            p_value = stats.shapiro(testData)[1]
        elif n_obs <= 300:
            method_line = "use lillifors:"
            p_value = lillifors(testData)[1]
        else:
            method_line = "use kstest:"
            # kstest(data, 'norm') on its own compares against the standard
            # normal N(0, 1), which rejects any sample that is not already
            # standardised. Compare against a normal with the sample's own
            # location and scale instead.
            mu = np.mean(testData)
            sigma = np.std(testData, ddof=1)
            if sigma == 0:
                # A sample with zero spread cannot be scaled; report it as
                # not normal instead of letting a NaN p-value pass.
                p_value = 0.0
            else:
                p_value = stats.kstest(testData, 'norm', args=(mu, sigma))[1]

        # Same rule as before: only p < 0.05 counts as "not normal".
        is_normal = not (p_value < 0.05)

        print(method_line)
        if is_normal:
            print("data are normal distributed")
        else:
            print("data are not normal distributed")
        return is_normal
     
     
    # Run the normality test on every sample group
    def NormalTest(list_groups):
        """Return True only if no group in list_groups fails check_normality.

        Groups are tested in order and testing stops at the first group for
        which check_normality returns False.
        """
        any_failed = any(
            check_normality(samples) == False for samples in list_groups
        )
        return not any_failed
                 
    
    '''
    group1=[2,3,7,2,6]
    group2=[10,8,7,5,10]
    group3=[10,13,14,13,15]
    list_groups=[group1,group2,group3]
    list_total=group1+group2+group3
    #对所有样本组进行正态性检验   
    NormalTest(list_groups)
    '''
    

    python风控评分卡建模和风控常识(博客主亲自录制视频教程)

  • 相关阅读:
    C# 根据URL获取网页截屏
    Django——WEB应用程序(手写程序),HTTP协议,BS CS架构
    jQuery——标签操作之(样式、文本内容、属性、文档处理)操作
    jQuery——简介,使用
    jQuery下载及应用
    javaScript——案例演示:点击有惊喜
    javaScript——案例演示:弹出模态框
    JavaScript——DOM操作+案例演示
    JavaScript——BOM操作
    JavaScript——杂碎小知识
  • 原文地址:https://www.cnblogs.com/webRobot/p/7140749.html
Copyright © 2020-2023  润新知