• DeepLearning to digit recognizer in kaggle


    DeepLearning to digit recognizer in kaggle


             近期在看deeplearning,于是就找了kaggle上字符识别进行练习。这里我主要用两种工具箱进行求解。并比对两者的结果。

    两种工具箱各自是DeepLearningToolbox和caffe。

    DeeplearningToolbox源代码解析见:http://blog.csdn.net/lu597203933/article/details/46576017

    Caffe学习见:http://caffe.berkeleyvision.org/

    一:DeeplearningToolbox

             DeeplearningToolbox基于matlab,很简单,读一下源代码,对于了解卷积神经网络等过程很有帮助。

    这里我主要是对digit recognizer给出的数据集进行预处理,以使其适用于我们的DeepLearningToolbox工具箱。主要包括两个.m文件,分别是predeal.m和cnntest.m文件。

    所需要做的就是修改addpath的路径,代码注释很详细,大家自己看。

    代码

    predeal.m

    % predeal.m -- use the DeepLearnToolbox to solve the digit recognizer in kaggle.
    % Reads train.csv / test.csv, trains the CNN configured below on the
    % training data and writes the predicted labels for test.csv to test_y.csv.
    clear;clc
    trainFile = 'train.csv';
    testFile = 'test.csv';

    % csvread opens the file by name on its own, so no fopen handle is needed
    % (the original opened two handles that were never used and never closed).
    M = csvread(trainFile, 1);   % read everything except the first (header) row
    train_x = M(:, 2:end);       % columns 2..end hold the pixel data
    label = M(:,1)';             % column 1 holds the label
    label(label == 0) = 10;      % sparse() below needs row indices >= 1, so label 0 becomes 10
    train_y = full(sparse(label, 1:size(train_x, 1), 1));   % label matrix: entry (label, sample) = 1

    train_x = double(reshape(train_x',28,28,size(train_x, 1)))/255;   % 28 x 28 x numSamples, divided by 255

    % Load the data to predict, prepared exactly like the training data
    M = csvread(testFile, 1);    % read everything except the first (header) row
    test_x = double(reshape(M',28,28,size(M, 1)))/255;
    clear label M testFile trainFile


    % Toolbox locations -- adjust these three paths to your installation
    addpath D:\DeepLearning\DeepLearnToolbox-master\data
    addpath D:\DeepLearning\DeepLearnToolbox-master\CNN
    addpath D:\DeepLearning\DeepLearnToolbox-master\util

    rand('state',0)
    cnn.layers = {        % number of feature maps, kernel size etc. of every layer
        struct('type', 'i') %input layer
        struct('type', 'c', 'outputmaps', 6, 'kernelsize', 5) %convolution layer
        struct('type', 's', 'scale', 2) %sub sampling layer
        struct('type', 'c', 'outputmaps', 12, 'kernelsize', 5) %convolution layer
        struct('type', 's', 'scale', 2) %subsampling layer
    };

    opts.alpha = 0.01;     % learning rate
    opts.batchsize = 50;   % samples used per update (mini-batch gradient descent)
    opts.numepochs = 25;   % number of passes over the training data
    cnn = cnnsetup(cnn, train_x, train_y);        % initialise the parameters (weights and biases) of every layer
    cnn = cnntrain(cnn, train_x, train_y, opts);  % training: back-propagation and the update iterations

    test_y = cnntest(cnn, test_x);      % predict the test data set
    test_y(test_y == 10) = 0;           % map class 10 back to digit 0
    test_y = test_y';
    M = [(1:length(test_y))' test_y(:)];   % column 1: running sample number, column 2: predicted digit
    csvwrite('test_y.csv', M);
    figure; plot(cnn.rL);

    

    cnntest.m

      function [test_y] = cnntest(net, x)
        % CNNTEST  Predict class indices for the samples in x.
        %   Runs the forward pass (cnnff) on x and returns the position of the
        %   largest entry of the network output net.o as found by max(), which
        %   works column by column on a matrix.
        net = cnnff(net, x);
        scores = net.o;
        [~, test_y] = max(scores);
    end
    

    结果:用DeepLearningToolbox得到的结果并不是很好,只有0.94586

    二:caffe to digit recognizer

             尽管caffe自带了对字符进行处理的mnist样例,但是官网给出的数据是二进制文件,得到的结果也只是一个简单的准确率,所以不能直接套用。

    过程例如以下:

    1:将给定csv数据转变成lmdb格式

    这里我在mnist的目录下写了一个convert_data_to_lmdb.cpp的程序对数据进行处理:

    代码例如以下:

    #include <iostream>
    #include <string>
    #include <sstream>
    #include <gflags/gflags.h>
    
    
    #include "boost/scoped_ptr.hpp"
    #include "gflags/gflags.h"
    #include "glog/logging.h"
    
    #include "caffe/proto/caffe.pb.h"
    #include "caffe/util/db.hpp"
    #include "caffe/util/io.hpp"
    #include "caffe/util/rng.hpp"
    
    using namespace caffe;
    using namespace std;
    using std::pair;
    using boost::scoped_ptr;
    
    /* edited by Zack
     * argv[1] the input file, argv[2] the output file*/
    
    DEFINE_string(backend, "lmdb", "The backend for storing the result");  // get Flags_backend == lmdb
    
    int main(int argc, char **argv){
    	::google::InitGoogleLogging(argv[0]);
    
    	#ifndef GFLAGS_GFLAGS_H_
    	  namespace gflags = google;
    	#endif
    
    	if(argc < 3){
    		LOG(ERROR)<< "please check the input arguments!";
    		return 1;
    	}
    	ifstream infile(argv[1]);
    	if(!infile){
    		LOG(ERROR)<< "please check the input arguments!";
    		return 1;
    	}
    	string str;
    	int count = 0;
    	int rows = 28;
    	int cols = 28;
    	unsigned char *buffer = new  unsigned char[rows*cols];
    	stringstream ss;
    
    	Datum datum;             // this data structure store the data and label
    	datum.set_channels(1);    // the channels
    	datum.set_height(rows);    // rows
    	datum.set_width(cols);     // cols
    
    	scoped_ptr<db::DB> db(db::GetDB(FLAGS_backend));         // new DB object
    	db->Open(argv[2], db::NEW);                    // open the lmdb file to store the data
    	scoped_ptr<db::Transaction> txn(db->NewTransaction());   // new Transaction object to put and commit the data
    
    	const int kMaxKeyLength = 256;           // to save the key
    	char key_cstr[kMaxKeyLength];
    
    	bool flag= false;
    	while(getline(infile, str)){
    		if(flag == false){
    			flag = true;
    			continue;
    		}
    		int beg = 0;
    		int end = 0;
    		int str_index = 0;
    		//test  need to add this----------1
    		//datum.set_label(0);
    		while((end = str.find_first_of(',', beg)) != string::npos){
    			//cout << end << endl;
    			string dig_str = str.substr(beg, end - beg);
    			int pixes;
    			ss.clear();
    			ss << dig_str;
    			ss >> pixes;
    			// test need to delete this--------------2
    			if(beg == 0){
    				datum.set_label(pixes);
    				beg = ++ end;
    				continue;
    			}
    			buffer[str_index++] = (unsigned char)pixes;
    			beg = ++end;
    		}
    		string dig_str = str.substr(beg);
    		int pixes;
    		ss.clear();
    		ss << dig_str;
    		ss >> pixes;
    		buffer[str_index++] = (unsigned char)pixes;
    		datum.set_data(buffer, rows*cols);
    
    		int length = snprintf(key_cstr, kMaxKeyLength, "%08d", count);
    
    		    // Put in db
    		string out;
    		CHECK(datum.SerializeToString(&out));              // serialize to string
    		txn->Put(string(key_cstr, length), out);        // put it, both the key and value
    
    		if (++count % 1000 == 0) {       // to commit every 1000 iteration
    		  // Commit db
    		  txn->Commit();
    		  txn.reset(db->NewTransaction());
    		  LOG(ERROR) << "Processed " << count << " files.";
    		}
    
    	}
    	// write the last batch
    	  if (count % 1000 != 0) {            // commit the last batch
    		txn->Commit();
    		LOG(ERROR) << "Processed " << count << " files.";
    	  }
    
    	return 0;
    }
    

    然后我们运行make all -j8对代码进行编译。

    这样在build目录下就会生成对应的二进制文件了。

    如图:

     

    然后运行./build/examples/mnist/convert_data_to_lmdb.bin examples/mnist/kaggle/data/train.csv examples/mnist/kaggle/mnist_train_lmdb --backend=lmdb

    就能够得到训练文件的lmdb格式文件了。对于测试文件test.csv,因为test.csv没有标签,所以需要对代码进行细微调整,2处调整已在上述代码中标注了。

    然后同样运行make all -j8,再运行./build/examples/mnist/convert_data_to_lmdb.bin examples/mnist/kaggle/data/test.csv examples/mnist/kaggle/mnist_test_lmdb --backend=lmdb

    就能够得到所相应的測试数据的lmdb格式文件了。

    2:用训练数据进行训练得到model

    Caffe在训练model的时候,每隔test_interval次迭代就要对测试数据集进行一次测试(每次测试使用test_iter个batch),因此我们这里可以用train.csv的前1000条数据制作一个验证用的数据集lmdb,过程和上面一样。

             分别将mnist文件夹以下的lenet_solver.prototxt和lenet_train_test.prototxt复制到kaggle文件夹以下。并对相应的包括文件所在文件夹和相应的batch size进行改动。详细见:下载地址。

    然后运行./build/tools/caffe train --solver=examples/mnist/kaggle/lenet_solver.prototxt,这样就能够得到我们的lenet_iter_10000.caffemodel了。

    3:提取測试集prob层的特征。

             这里我们使用tools文件下的extract_features.cpp的源文件。可是该源文件产生的结果是lmdb的格式。因此我对源代码进行了改动例如以下:

    #include <stdio.h>  // for snprintf
    #include <string>
    #include <vector>
    #include <fstream>
    
    #include "boost/algorithm/string.hpp"
    #include "google/protobuf/text_format.h"
    
    #include "caffe/blob.hpp"
    #include "caffe/common.hpp"
    #include "caffe/net.hpp"
    #include "caffe/proto/caffe.pb.h"
    #include "caffe/util/db.hpp"
    #include "caffe/util/io.hpp"
    #include "caffe/vision_layers.hpp"
    
    using caffe::Blob;
    using caffe::Caffe;
    using caffe::Datum;
    using caffe::Net;
    using boost::shared_ptr;
    using std::string;
    namespace db = caffe::db;
    
    // Runs the complete feature extraction for the floating point type Dtype;
    // defined further down in this file.
    template<typename Dtype>
    int feature_extraction_pipeline(int argc, char** argv);

    // Entry point: hands the command line to the single precision pipeline and
    // uses its return value as the exit status of the process.
    int main(int argc, char** argv) {
      const int exit_status = feature_extraction_pipeline<float>(argc, argv);
      return exit_status;
      // double precision alternative:
      //   feature_extraction_pipeline<double>(argc, argv);
    }
    
    template<typename Dtype>
    int feature_extraction_pipeline(int argc, char** argv) {
      ::google::InitGoogleLogging(argv[0]);
      const int num_required_args = 7;     /// the parameters must be not less 7
      if (argc < num_required_args) {
        LOG(ERROR)<<
        "This program takes in a trained network and an input data layer, and then"
        " extract features of the input data produced by the net.
    "
        "Usage: extract_features  pretrained_net_param"
        "  feature_extraction_proto_file  extract_feature_blob_name1[,name2,...]"
        "  save_feature_dataset_name1[,name2,...]  num_mini_batches  db_type"
        "  [CPU/GPU] [DEVICE_ID=0]
    "
        "Note: you can extract multiple features in one pass by specifying"
        " multiple feature blob names and dataset names seperated by ','."
        " The names cannot contain white space characters and the number of blobs"
        " and datasets must be equal.";
        return 1;
      }
      int arg_pos = num_required_args;     //the necessary nums of parameters
    
      arg_pos = num_required_args;
      if (argc > arg_pos && strcmp(argv[arg_pos], "GPU") == 0) {          // whether use GPU------ -gpu 0
        LOG(ERROR)<< "Using GPU";
        uint device_id = 0;
        if (argc > arg_pos + 1) {
          device_id = atoi(argv[arg_pos + 1]);
          CHECK_GE(device_id, 0);
        }
        LOG(ERROR) << "Using Device_id=" << device_id;
        Caffe::SetDevice(device_id);
        Caffe::set_mode(Caffe::GPU);
      } else {
        LOG(ERROR) << "Using CPU";
        Caffe::set_mode(Caffe::CPU);
      }
    
      arg_pos = 0;  // the name of the executable
      std::string pretrained_binary_proto(argv[++arg_pos]);      // the mode had been trained
    
      // Expected prototxt contains at least one data layer such as
      //  the layer data_layer_name and one feature blob such as the
      //  fc7 top blob to extract features.
      /*
       layers {
         name: "data_layer_name"
         type: DATA
         data_param {
           source: "/path/to/your/images/to/extract/feature/images_leveldb"
           mean_file: "/path/to/your/image_mean.binaryproto"
           batch_size: 128
           crop_size: 227
           mirror: false
         }
         top: "data_blob_name"
         top: "label_blob_name"
       }
       layers {
         name: "drop7"
         type: DROPOUT
         dropout_param {
           dropout_ratio: 0.5
         }
         bottom: "fc7"
         top: "fc7"
       }
       */
      std::string feature_extraction_proto(argv[++arg_pos]);    // get the net structure
      shared_ptr<Net<Dtype> > feature_extraction_net(
          new Net<Dtype>(feature_extraction_proto, caffe::TEST));               //new net object  and set each layers------feature_extraction_net
      feature_extraction_net->CopyTrainedLayersFrom(pretrained_binary_proto);           // init the weights
    
      std::string extract_feature_blob_names(argv[++arg_pos]);          //exact which blob's feature
      std::vector<std::string> blob_names;
      boost::split(blob_names, extract_feature_blob_names, boost::is_any_of(","));   //you can exact many blobs' features and to store them in different dirname
    
      std::string save_feature_dataset_names(argv[++arg_pos]);   // to store the features
      std::vector<std::string> dataset_names;
      boost::split(dataset_names, save_feature_dataset_names,         // each dataset_names to store one blob's feature
                   boost::is_any_of(","));
      CHECK_EQ(blob_names.size(), dataset_names.size()) <<
          " the number of blob names and dataset names must be equal";
      size_t num_features = blob_names.size();     // how many features you exact
    
      for (size_t i = 0; i < num_features; i++) {
        CHECK(feature_extraction_net->has_blob(blob_names[i]))
            << "Unknown feature blob name " << blob_names[i]
            << " in the network " << feature_extraction_proto;
      }
    
      int num_mini_batches = atoi(argv[++arg_pos]);            // each exact num_mini_batches of images
    
      // init the DB and Transaction for all blobs you want to extract features
      std::vector<shared_ptr<db::DB> > feature_dbs;               // new DB object, is a vector  maybe has many blogs' feature
      std::vector<shared_ptr<db::Transaction> > txns;            // new Transaction object, is a vectore maybe has many blob's feature
    
    
      // edit by Zack
       //std::string strfile = "/home/hadoop/caffe/textileImage/features/probTest";
      std::string strfile = argv[argc-1];
      std::vector<std::ofstream*> vec(num_features, 0);
    
      const char* db_type = argv[++arg_pos];                  //the data to store style == lmdb
      for (size_t i = 0; i < num_features; ++i) {
        LOG(INFO)<< "Opening dataset " << dataset_names[i];               // dataset_name[i] to store the feature which type is lmdb
        shared_ptr<db::DB> db(db::GetDB(db_type));             // the type of the db
        db->Open(dataset_names.at(i), db::NEW);          // open the dir to store the feature
        feature_dbs.push_back(db);             // put the db to the vector
        shared_ptr<db::Transaction> txn(db->NewTransaction());     // the transaction to the db
        txns.push_back(txn);                // put the transaction to the vector
    
    // edit by Zack
    
        std::stringstream ss;
        ss.clear();
        string index;
        ss << i;
        ss >> index;
        std::string str = strfile + index + ".txt";
        vec[i] = new std::ofstream(str.c_str());
      }
    
      LOG(ERROR)<< "Extacting Features";
    
      Datum datum;
      const int kMaxKeyStrLength = 100;
      char key_str[kMaxKeyStrLength];      // to store the key
      std::vector<Blob<float>*> input_vec;
      std::vector<int> image_indices(num_features, 0);   /// how many blogs' feature you exact
    
    
      for (int batch_index = 0; batch_index < num_mini_batches; ++batch_index) {
        feature_extraction_net->Forward(input_vec);
        for (int i = 0; i < num_features; ++i) {    // to exact the blobs' name  maybe fc7 fc8
          const shared_ptr<Blob<Dtype> > feature_blob = feature_extraction_net
              ->blob_by_name(blob_names[i]);
          int batch_size = feature_blob->num();     // the nums of images-------batch size
          int dim_features = feature_blob->count() / batch_size;    // this dim of this feature of each image in this blob
          const Dtype* feature_blob_data;   // float is the features
          for (int n = 0; n < batch_size; ++n) {
            datum.set_height(feature_blob->height());     // set the height
            datum.set_width(feature_blob->width());     // set the width
            datum.set_channels(feature_blob->channels());    // set the channel
            datum.clear_data();               // clear data
            datum.clear_float_data();        // clear float_data
            feature_blob_data = feature_blob->cpu_data() +
                feature_blob->offset(n);    //the features of  which image
            for (int d = 0; d < dim_features; ++d) {
              datum.add_float_data(feature_blob_data[d]);
              (*vec[i]) << feature_blob_data[d] << " ";          // save the features
            }
            (*vec[i]) << std::endl;
            //LOG(ERROR)<< "dim" << dim_features;
            int length = snprintf(key_str, kMaxKeyStrLength, "%010d",
                image_indices[i]);       // key  di ji ge tupian
            string out;
            CHECK(datum.SerializeToString(&out));    // serialize to string
            txns.at(i)->Put(std::string(key_str, length), out);       // put to transaction
            ++image_indices[i];       // key++
            if (image_indices[i] % 1000 == 0) {    // when it reach to 1000 ,we commit it
              txns.at(i)->Commit();
              txns.at(i).reset(feature_dbs.at(i)->NewTransaction());
              LOG(ERROR)<< "Extracted features of " << image_indices[i] <<
                  " query images for feature blob " << blob_names[i];
            }
          }  // for (int n = 0; n < batch_size; ++n)
        }  // for (int i = 0; i < num_features; ++i)
      }  // for (int batch_index = 0; batch_index < num_mini_batches; ++batch_index)
      // write the last batch
      for (int i = 0; i < num_features; ++i) {
        if (image_indices[i] % 1000 != 0) {     // commit the last path images
          txns.at(i)->Commit();
        }
        // edit by Zack
          vec[i]->close();
          delete vec[i];
    
        LOG(ERROR)<< "Extracted features of " << image_indices[i] <<
            " query images for feature blob " << blob_names[i];
        feature_dbs.at(i)->Close();
      }
    
      LOG(ERROR)<< "Successfully extracted the features!";
      return 0;
    }
    

    最后将得到的prob层(即最后得到的概率)存入到了txt中了。

    此外对网络结构进行了调整:只需要预测,网络中的参数都可以去掉不要了。

    deploy.prototxt代码例如以下:

    # Network definition "LeNet": data -> conv1 -> pool1 -> conv2 -> pool2
    # -> ip1 -> relu1 -> ip2 -> prob, plus an accuracy and a loss layer.
    name: "LeNet"
    # Data layer: reads the LMDB database given in "source", 100 records per
    # batch, and provides the blobs "data" and "label". The input values are
    # scaled by 0.00390625 (= 1/256).
    layer {
      name: "mnist"
      type: "Data"
      top: "data"
      top: "label"
      transform_param {
        scale: 0.00390625
      }
      data_param {
        source: "examples/mnist/kaggle/mnist_test_lmdb"
        batch_size: 100
        backend: LMDB
      }
    }
    
    # First convolution: 20 outputs, kernel size 5, stride 1.
    layer {
      name: "conv1"
      type: "Convolution"
      bottom: "data"
      top: "conv1"
     
      convolution_param {
        num_output: 20
        kernel_size: 5
        stride: 1
       
      }
    }
    # Max pooling over conv1: kernel size 2, stride 2.
    layer {
      name: "pool1"
      type: "Pooling"
      bottom: "conv1"
      top: "pool1"
      pooling_param {
        pool: MAX
        kernel_size: 2
        stride: 2
      }
    }
    # Second convolution: 50 outputs, kernel size 5, stride 1.
    layer {
      name: "conv2"
      type: "Convolution"
      bottom: "pool1"
      top: "conv2"
    
      convolution_param {
        num_output: 50
        kernel_size: 5
        stride: 1
       
      }
    }
    # Max pooling over conv2: kernel size 2, stride 2.
    layer {
      name: "pool2"
      type: "Pooling"
      bottom: "conv2"
      top: "pool2"
      pooling_param {
        pool: MAX
        kernel_size: 2
        stride: 2
      }
    }
    # First inner product layer: 500 outputs.
    layer {
      name: "ip1"
      type: "InnerProduct"
      bottom: "pool2"
      top: "ip1"
      
      inner_product_param {
        num_output: 500
        
      }
    }
    # ReLU applied in place: bottom and top are both "ip1".
    layer {
      name: "relu1"
      type: "ReLU"
      bottom: "ip1"
      top: "ip1"
    }
    # Second inner product layer: 10 outputs.
    layer {
      name: "ip2"
      type: "InnerProduct"
      bottom: "ip1"
      top: "ip2"
    
      inner_product_param {
        num_output: 10
       
      }
    }
    # Softmax over "ip2"; its top blob "prob" is the one named on the
    # extract_features command line.
    layer {
      name: "prob"
      type: "Softmax"
      bottom: "ip2"
      top: "prob"
    }
    # Accuracy of "prob" against "label".
    # NOTE(review): this layer and the loss layer below consume "label";
    # presumably neither is needed when only "prob" is extracted -- confirm
    # before removing them.
    layer {
      name: "accuracy"
      type: "Accuracy"
      bottom: "prob"
      bottom: "label"
      top: "accuracy"
    }
    # Softmax loss of "ip2" against "label".
    layer {
      name: "loss"
      type: "SoftmaxWithLoss"
      bottom: "ip2"
      bottom: "label"
      top: "loss"
    }
    

    然后运行

    ./build/tools/extract_features.bin examples/mnist/kaggle/lenet_iter_10000.caffemodel examples/mnist/kaggle/deploy.prototxt prob examples/mnist/kaggle/features 280 lmdb /home/hadoop/caffe/caffe-master/examples/mnist/kaggle/feature

    其中280为迭代次数(num_mini_batches):由于deploy.prototxt中batch_size设为了100,280×100=28000,正好等于测试数据集的大小。/home/hadoop/caffe/caffe-master/examples/mnist/kaggle/feature为最终提取的特征以txt保存时的路径前缀(程序会在其后加上序号和.txt,这里得到feature0.txt)。examples/mnist/kaggle/lenet_iter_10000.caffemodel为训练得到的权重参数,examples/mnist/kaggle/deploy.prototxt为网络结构。

    4:对得到的txt进行后处理

    通过上面三个步骤,我们就能够得到feature0.txt,存放的数据为28000*10大小,对应每一个样本属于各个类别的概率。然后运行下面的matlab代码就能够得到kaggle所需要的提交结果了。最后的准确率为0.98986,排名也提升了400+。great!!

    % Post-processing of the Caffe output: turn the values stored in
    % feature0.txt (one row per sample) into the two-column file test_y3.csv.
    clear;clc;
    feature = load('feature0.txt')';       % transposed: max() below works column by column
    [~,test_y] = max(feature);             % 1-based position of the largest value
    [M,N] = size(test_y);
    test_y = (test_y - ones(M, N))';       % subtract 1 from every position, then make it a column
    M = [(1:length(test_y))' test_y(:)];   % column 1: running sample number, column 2: result
    csvwrite('test_y3.csv', M);


    全部文件代码下载见:https://github.com/zack6514/zackcoding

  • 相关阅读:
    【转】linux常用命令
    【转】C++三大特性
    插入排序
    shixi
    【转】TCP协议
    【转】排序算法稳定性
    面筋BD
    斐波那契数列
    【面试题】D
    【学习笔记】OI模板整理
  • 原文地址:https://www.cnblogs.com/wzzkaifa/p/7086848.html
Copyright © 2020-2023  润新知