• Final Assignment


    1. Boston housing price prediction



    from sklearn.datasets import load_boston
    boston = load_boston()
    boston.keys()

    # split into training and test sets
    from sklearn.model_selection import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size=0.3)
    print(x_train.shape, y_train.shape)

    # multiple linear regression model
    from sklearn.linear_model import LinearRegression
    LineR = LinearRegression()
    LineR.fit(x_train, y_train)
    print("Coefficients:", LineR.coef_)      # one coefficient per feature
    print("Intercept:", LineR.intercept_)    # intercept

    # evaluate the model
    from sklearn.metrics import mean_squared_error, mean_absolute_error
    y_pred = LineR.predict(x_test)            # predict on the test set
    n = mean_squared_error(y_test, y_pred)    # mean squared error of the predictions
    print("Linear regression MSE:", n)
    a = mean_absolute_error(y_test, y_pred)
    print("Linear regression MAE:", a)
    s = LineR.score(x_test, y_test)           # R^2 score of the model
    print("Linear regression R^2 score:", s)

    # multivariate polynomial regression model
    from sklearn.preprocessing import PolynomialFeatures

    poly = PolynomialFeatures(degree=2)
    x_poly_train = poly.fit_transform(x_train)   # expand features into degree-2 polynomial terms
    x_poly_test = poly.transform(x_test)
    
    # build the model
    LineR2 = LinearRegression()
    LineR2.fit(x_poly_train, y_train)

    y_pred2 = LineR2.predict(x_poly_test)    # predict
    # evaluate the model
    n2 = mean_squared_error(y_test, y_pred2)
    print("Polynomial regression MSE:", n2)
    a2 = mean_absolute_error(y_test, y_pred2)
    print("Polynomial regression MAE:", a2)
    s2 = LineR2.score(x_poly_test, y_test)
    print("Polynomial regression R^2 score:", s2)

    # Compare the performance of the linear and the non-linear model, and explain why
    The non-linear model performs better: the degree-2 polynomial model fits a curved surface that follows the distribution of the sample points more closely, so its error metrics come out lower.
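
    A compact way to back up this claim is to print the two models' metrics side by side; a minimal sketch reusing the variables computed above:

    print("model          MSE      MAE      R^2")
    print("linear      %8.3f %8.3f %8.3f" % (n, a, s))     # linear regression metrics
    print("polynomial  %8.3f %8.3f %8.3f" % (n2, a2, s2))  # polynomial regression metrics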

    2. Chinese text classification



    import os
    import numpy as np
    import sys
    from datetime import datetime
    import gc
    path = r'E:\258'            # raw string so the backslash is not treated as an escape
    files = os.listdir(path)    # list the entries under the data directory
    files

    def list_all_files(rootdir):
        import os
        _files = []
        entries = os.listdir(rootdir)    # list all entries under the directory
        for i in range(0, len(entries)):
            path = os.path.join(rootdir, entries[i])
            if os.path.isdir(path):      # recurse into subdirectories
                _files.extend(list_all_files(path))
            if os.path.isfile(path):
                _files.append(path)
        return _files
    _fs = list_all_files(r'E:\258')
    # all files, enumerated recursively
    _fs

    !pip install jieba
    import jieba
    with open(r'E:\stopsCN.txt', encoding='utf-8') as f:
        stopwords = f.read().split('\n')

    def processing(tokens):
        tokens = "".join([char for char in tokens if char.isalpha()])                     # keep only letters and CJK characters
        tokens = [token for token in jieba.cut(tokens, cut_all=True) if len(token) >= 2]  # jieba segmentation, keep tokens of length >= 2
        tokens = " ".join([token for token in tokens if token not in stopwords])          # remove stopwords
        return tokens
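
    To see what the preprocessing produces, processing() can be run on a made-up sample sentence (the sentence and its output are illustrative only):

    sample = "北京时间2018年12月，这是一条测试新闻。"
    print(processing(sample))    # prints the cleaned, space-separated tokens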
    tokenList = []
    targetList = []
    for root, dirs, files in os.walk(path):    # walk the directory tree and open each file
        for f in files:
            filePath = os.path.join(root, f)
            with open(filePath, encoding='utf-8') as f:
                content = f.read()
                target = filePath.split('\\')[-2]    # the parent folder name is the class label
                targetList.append(target)
                tokenList.append(processing(content))
    # split training/test sets and vectorize with TF-IDF
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.model_selection import cross_val_score
    from sklearn.metrics import classification_report
    x_train, x_test, y_train, y_test = train_test_split(tokenList, targetList, test_size=0.3, stratify=targetList)
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(x_train)
    X_test = vectorizer.transform(x_test)
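
    Before fitting a model it is worth checking the shape of the sparse TF-IDF matrices (rows = documents, columns = vocabulary terms); a minimal sketch:

    print(X_train.shape, X_test.shape)    # both share the vocabulary learned from the training set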
    # multinomial naive Bayes
    mnb = MultinomialNB()
    module = mnb.fit(X_train, y_train)
    y_predict = module.predict(X_test)    # predict on the test set
    scores = cross_val_score(mnb, X_test, y_test, cv=5)
    print("Accuracy:%.3f" % scores.mean())
    print("classification_report:\n", classification_report(y_test, y_predict))

    # inspect the first few labels and token strings
    print(targetList[0:10])
    tokenList[0:10]

    # count the news items per class in the test set and in the predictions
    import collections
    testCount = collections.Counter(y_test)
    predCount = collections.Counter(y_predict)
    print('Actual:', testCount, '\n', 'Predicted:', predCount)

    # build the label list, the actual-count list and the predicted-count list
    nameList = list(testCount.keys())
    testList = list(testCount.values())
    predictList = [predCount.get(name, 0) for name in nameList]    # align predicted counts with nameList order
    x = list(range(len(nameList)))
    print("News categories:", nameList, '\n', "Actual:", testList, '\n', "Predicted:", predictList)
