

    Following on from Case 1, let's work through another example using a different approach.

    Straight to the code:

    #!/usr/bin/Rscript
    
    library(plyr)
    library(reshape2)
    
    #1. Build the naive Bayes classifier from the training set
    #1.1 Compute the class probabilities
    
    ## Compute the probability of each class in training set D, i.e. P{c_i}
    ## Input:  trainData    the training set, as a data frame
    ##         strClassName the name of the column that holds the class labels
    ## Output: a data frame, the set of P{c_i}: class name | probability (column named prob)
    class_prob <- function(trainData, strClassName){
        # number of samples in the training set (nrow returns the row count)
        length.train <- nrow(trainData)
        # count the occurrences of each class
        dTemp <- ddply(trainData, strClassName, "nrow")
        # convert the counts to probabilities
        dTemp <- ddply(dTemp, strClassName, mutate, prob = nrow/length.train)
        # drop the count column, keeping class name | prob
        dTemp[,-2]
    }
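
    A quick sanity check with a made-up toy data frame (my example, not from the original post):

    toy <- data.frame(taste = c("good", "good", "bad"))
    class_prob(toy, "taste")
    ## expected output:
    ##   taste      prob
    ## 1   bad 0.3333333
    ## 2  good 0.6666667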
    
    ##1.2 Compute, for each class, the probability of each feature value
    ## Compute from training set D, for each class, the probability of each feature taking each of its values, i.e. P{fi|c_i}
    ## Input:  trainData    the training set, as a data frame
    ##         strClassName the name of the class-label column; all other columns are treated as features
    ## Output: a data frame, the set of P{fi|c_i}: class name | feature name | feature value | probability (column named prob)
    feature_class_prob <- function(trainData, strClassName){
        # reshape from wide to long format
        data.melt <- melt(trainData, id=c(strClassName))
        # count the frequency of each (class, feature, value) combination
        aa <- ddply(data.melt, c(strClassName,"variable","value"), "nrow")
        # convert the counts to conditional probabilities within each (class, feature) group
        bb <- ddply(aa, c(strClassName,"variable"), mutate, sum = sum(nrow), prob = nrow/sum)
        # assign column names
        colnames(bb) <- c("class.name",
                        "feature.name",
                        "feature.value",
                        "feature.nrow",
                        "feature.sum",
                        "prob")
        # return class name | feature name | feature value | prob
        bb[,c(1,2,3,6)]
    }
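
    Again a quick sanity check with a made-up toy data frame (not from the original post):

    toy <- data.frame(color = c("red", "green", "red"),
                      taste = c("good", "good", "bad"),
                      stringsAsFactors = FALSE)
    feature_class_prob(toy, "taste")
    ## expected output:
    ##   class.name feature.name feature.value prob
    ## 1        bad        color           red  1.0
    ## 2       good        color         green  0.5
    ## 3       good        color           red  0.5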
    
    ## This completes the construction of the naive Bayes classifier
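
    The prediction code that follows scores each class c by the unnormalized posterior P(c | f_1, ..., f_n) ∝ P(c) × P(f_1 | c) × ... × P(f_n | c), and the class with the largest score is taken as the prediction. The two tables built above supply exactly the P(c) and P(f_i | c) factors.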
    
    ## 2. Use the generated naive Bayes classifier for prediction
    ## Predict with the generated classifier, i.e. apply P{fi|c_i}
    ## Input:  oneObs a data frame, the sample to predict, format: feature name | feature value
    ##         pc     a data frame, the class probabilities P{c_i} in training set D: class name | probability
    ##         pfc    a data frame, for each class, the probabilities of each feature value, i.e. P{fi|c_i}:
    ##                class name | feature name | feature value | probability
    ## Output: a data frame, the probability of each class for the sample: class name | posterior probability (column named prob)
    pre_class <- function(oneObs, pc, pfc){
        colnames(oneObs) <- c("feature.name", "feature.value")
        colnames(pc) <- c("class.name", "prob")
        colnames(pfc) <- c("class.name", "feature.name", "feature.value", "prob")
        # look up the conditional probability of each observed feature value
        feature.all <- join(oneObs, pfc, by=c("feature.name","feature.value"), type="inner")
        # multiply the conditional probabilities of the feature values together (prod is the product function)
        feature.prob <- ddply(feature.all, .(class.name), summarize, prob_fea=prod(prob))
        # look up the class prior probabilities
        class.all <- join(feature.prob, pc, by="class.name", type="inner")
        # output the result
        ddply(class.all, .(class.name), mutate, pre_prob=prob_fea*prob)[,c(1,4)]
    }
    
    ## 3. Testing with data
    ## Use the apple data from above as the test example
    # training set (the Chinese feature values were stripped during extraction; only "绿" survived)
    train.apple <- data.frame(
        size=c("","","","","",""),
        weight=c("","","","","",""),
        color=c("","","","绿","","绿"),
        taste=c("good","good","bad","bad","bad","good")
    )
    # sample to predict
    oneObs <- data.frame(
        feature.name =c("size", "weight", "color"),
        feature.value =c("","","")
    )
    # predict the class
    pc <- class_prob(train.apple, "taste")
    pfc <- feature_class_prob(train.apple, "taste")
    pre_class(oneObs, pc, pfc)

    The prediction output is:

      class.name   pre_prob
    1        bad 0.07407407
    2       good 0.03703704

    So the predicted taste of this apple is: bad.
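
    Note that pre_prob is an unnormalized posterior: the evidence term P(f_1, ..., f_n) is never divided out, which is fine for picking the winning class. If actual probabilities are wanted, a one-liner (my addition, not in the original post) normalizes them:

    res <- pre_class(oneObs, pc, pfc)
    # divide by the sum of the scores so they become probabilities summing to 1
    res$pre_prob / sum(res$pre_prob)
    ## gives 0.6666667 (bad) and 0.3333333 (good)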

    ********************************************* divider *********************************************

    Now let's use this method to make a prediction on the dataset from Case 1.

    # dataset: build a 14x5 matrix filled by row, then convert it to a data frame
    data <- as.data.frame(matrix(c("sunny","hot","high","weak","no",
                     "sunny","hot","high","strong","no",
                     "overcast","hot","high","weak","yes",
                     "rain","mild","high","weak","yes",
                     "rain","cool","normal","weak","yes",
                     "rain","cool","normal","strong","no",
                     "overcast","cool","normal","strong","yes",
                     "sunny","mild","high","weak","no",
                     "sunny","cool","normal","weak","yes",
                     "rain","mild","normal","weak","yes",
                     "sunny","mild","normal","strong","yes",
                     "overcast","mild","high","strong","yes",
                     "overcast","hot","normal","weak","yes",
                     "rain","mild","high","strong","no"),
                     byrow = TRUE,
                     dimnames = list(day = c(), condition = c("outlook","temperature","humidity","wind","playtennis")),
                     nrow = 14,
                     ncol = 5))
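
    A quick shape check on the constructed frame (my addition, not in the original post):

    dim(data)      # expected: 14  5
    head(data, 2)  # first two rows: sunny hot high weak no / sunny hot high strong no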
    
    # sample to predict
    ddata <- data.frame(
        feature.name =c("outlook", "temperature","humidity","wind"),
        feature.value =c("overcast","mild","normal","weak")
    )
    
    # predict the class
    pc <- class_prob(data, "playtennis")
    pfc <- feature_class_prob(data, "playtennis")
    pre_class(ddata, pc, pfc)

    The prediction output is:

      class.name   pre_prob
    1         no 0.02666667
    2        yes 0.13168724

    The predicted class is yes, which matches the result from Case 1.
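
    As an independent cross-check (my addition, assuming the e1071 package is installed; it is not used in the original post), the same query can be run through a library implementation of naive Bayes. The numbers need not match the hand-rolled classifier exactly: pre_class silently drops feature/class combinations never seen in training (a side effect of the inner join), while e1071 scores them as (near-)zero, and predict() returns normalized posteriors.

    library(e1071)
    
    # fit a naive Bayes model on the same training frame
    model <- naiveBayes(playtennis ~ ., data = data)
    
    # normalized posterior probabilities for the same query sample
    newdata <- data.frame(outlook = "overcast", temperature = "mild",
                          humidity = "normal", wind = "weak")
    predict(model, newdata, type = "raw")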

  • Original article: https://www.cnblogs.com/hunttown/p/5526716.html