• kaggle-Digit Recognizer


    • 安装kaggle工具获取数据源(linux 环境)
    • 采用sklearn的KNeighborsClassifier训练数据
    • 通过K折交叉验证来选取K值使正确率更高

    1.安装kaggle,获取数据源

    pip install kaggle
    

    将数据下载到目录/data/data-test/digit_recognize/下

    cd /data/data-test/digit_recognize/
    kaggle competitions download -c digit-recognizer
    

    2.安装anaconda3作为python3环境,自带sklearn,pandas,numpy等常用工具包

    3.代码实现

    import pandas as pd
    from sklearn.model_selection import cross_val_score
    from sklearn.neighbors import KNeighborsClassifier
    import pickle
    
    
    # File locations: the project data directory and the pickled-model path.
    project_path = '/data/data-test/digit_recognize/'
    clf_file = f'{project_path}knn.pickle'
    
    
    def get_data_chunk(file_name):
        """Read a CSV file in chunks and return it as one DataFrame.

        Chunked reading keeps peak memory bounded for large files.  The
        number of chunks read so far is printed after each chunk, matching
        the original progress logging.

        :param file_name: path of the CSV file to read
        :return: pandas.DataFrame with a fresh 0..n-1 index
        """
        chunk_size = 100000
        chunks = []
        # chunksize= yields a TextFileReader; using it as a context manager
        # guarantees the underlying file is closed even if a read fails
        # (the original manual get_chunk/StopIteration loop leaked it).
        with pd.read_csv(file_name, chunksize=chunk_size) as reader:
            for chunk in reader:
                chunks.append(chunk)
                print(len(chunks))
        return pd.concat(chunks, ignore_index=True)
    
    
    def save_clf(clf_s):
        """Serialize the trained classifier to `clf_file` with pickle.

        The with-block guarantees the handle is closed even if
        pickle.dump raises (the original leaked the handle on error).

        :param clf_s: fitted classifier object to persist
        """
        with open(clf_file, 'wb') as clf_f:
            pickle.dump(clf_s, clf_f)
    
    
    def get_clf():
        """Load and return the pickled classifier from `clf_file`.

        Bug fixed: the original opened the file and never closed it.
        NOTE(review): pickle.load is unsafe on untrusted files; acceptable
        here because the file is produced locally by save_clf.

        :return: the deserialized classifier object
        """
        with open(clf_file, 'rb') as clf_f:
            return pickle.load(clf_f)
    
    # Predict labels for the test set and write the Kaggle submission file.
    def predict():
        """Write predictions for test.csv to res.csv in submission format.

        Bug fixed: Kaggle's test.csv contains only pixel columns, so the
        original `test_data["imageId"]` raised KeyError.  The submission's
        `ImageId` column is simply the 1-based row number, and the expected
        header casing is `ImageId`.
        """
        knn_clf = get_clf()
        test_data = get_data_chunk(project_path + "test.csv")
        res_data = knn_clf.predict(test_data)
        df = pd.DataFrame()
        # ImageId = 1-based row index, per the competition's sample submission.
        df["ImageId"] = range(1, len(test_data) + 1)
        df["Label"] = res_data
        df.to_csv(project_path + 'res.csv', index=False)
    
    
    def train():
        """Pick k for KNN via cross-validation, fit, and persist the model.

        Bug fixed: the original stored the best score in `max` (shadowing
        the builtin) but never updated it inside the loop, so
        `if mean > max` was true for every k and `max_k` always ended up
        as the last k tried (14) instead of the best one.
        """
        train_data = get_data_chunk(project_path + "train.csv")
        print(train_data.info())
        print(train_data)
        train_label = train_data['label']
        x = train_data.drop(columns=['label'])

        best_score = 0.0
        max_k = 5

        # Score k = 5..14 with K-fold cross-validation and keep the best k.
        for k in range(5, 15):
            clf = KNeighborsClassifier(n_neighbors=k)
            # cv=2: 2-fold cross-validation (cheap, but high variance).
            scores = cross_val_score(clf, x, train_label, cv=2, scoring='accuracy')
            mean = scores.mean()
            print(k, mean)
            if mean > best_score:
                best_score = mean
                max_k = k
        print("maxK=", max_k)
        # Refit on the full training set with the winning k.
        clf = KNeighborsClassifier(n_neighbors=max_k)
        clf.fit(x, train_label)
        # Persist the fitted model to the pickle file.
        save_clf(clf)
      	
    def _main():
        # Full pipeline: fit/persist the model, then score the test set.
        train()
        predict()


    if __name__ == '__main__':
        _main()
  • 相关阅读:
    SQL JOB
    Log4net配置
    教你怎么使用Windows7系统自带的备份与还原的方法
    在LINQ TO SQL 中使用MVC3中的DataAnnotations 【MetadataType】
    图片下载
    DOS的一些常用命令
    自动合并多个文件如js css等 可以增加效率
    利用$.getJSON() 跨域请求操作
    在razor中使用递归,巧用递归
    Use ASP.NET and DotNetZip to Create and Extract ZIP Files
  • 原文地址:https://www.cnblogs.com/fwdqxl/p/10124490.html
Copyright © 2020-2023  润新知