OpenCV3 OCR classification with SVM, ANN, AdaBoost, KNN, random forests and other machine learning methods


    Reposted from http://www.cnblogs.com/denny402/p/5032839.html

    The ml classes in OpenCV3 differ from those in OpenCV2 (a minimal sketch of the API change follows); the example below exercises the OpenCV3 machine learning classes:
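
    For instance, OpenCV2's concrete classes (CvSVM, CvANN_MLP, ...) gave way to factory-created Ptr<> models under cv::ml. An illustrative sketch of the change, using SVM (not from the original post; names like train_data/labels/params are placeholders):

        // OpenCV2 style: concrete object, parameters passed to train()
        CvSVM svm;
        svm.train(train_data, labels, Mat(), Mat(), params);

        // OpenCV3 style: factory create(), setters, then train()
        Ptr<SVM> svm = SVM::create();
        svm->setType(SVM::C_SVC);
        svm->train(tdata);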

    The task is classifying the OCR sample data that ships with OpenCV. The neural network and AdaBoost are very slow to train; KNN still gives the best results.
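
    For reference, each record of letter-recognition.data is a capital-letter label followed by 16 comma-separated integer features (the UCI letter-recognition format); a record is shaped like:

        T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8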

    #include <opencv2/opencv.hpp>
    #include <cstdio>    // FILE, fgets, printf
    #include <cstring>   // strchr, memcpy
    #include <cfloat>    // FLT_EPSILON, DBL_MAX
    #include <iostream>
    using namespace std;
    using namespace cv;
    using namespace cv::ml;

    // Read the data file: each line holds one sample, "<label>,<f1>,...,<f16>"
    bool read_num_class_data(const string& filename, int var_count, Mat* _data, Mat* _responses)
    {
        const int M = 1024;
        char buf[M + 2];

        Mat el_ptr(1, var_count, CV_32F);
        int i;
        vector<int> responses;

        _data->release();
        _responses->release();
        FILE *f;
        fopen_s(&f, filename.c_str(), "rt");
        if (!f)
        {
            cout << "Could not read the database " << filename << endl;
            return false;
        }

        for (;;)
        {
            char* ptr;
            if (!fgets(buf, M, f) || !strchr(buf, ','))
                break;
            responses.push_back((int)buf[0]);   // first character is the class label
            ptr = buf + 2;                      // skip the label and the comma
            for (i = 0; i < var_count; i++)
            {
                int n = 0;
                sscanf_s(ptr, "%f%n", &el_ptr.at<float>(i), &n);
                ptr += n + 1;
            }
            if (i < var_count)
                break;
            _data->push_back(el_ptr);
        }
        fclose(f);
        Mat(responses).copyTo(*_responses);
        return true;
    }
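
    // Portability note: fopen_s and sscanf_s above are MSVC-specific. On
    // GCC/Clang, a sketch of the portable equivalents:
    //
    //     FILE* f = fopen(filename.c_str(), "rt");
    //     sscanf(ptr, "%f%n", &el_ptr.at<float>(i), &n);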

    // Prepare the training data: mark the first ntrain_samples rows as the training set
    Ptr<TrainData> prepare_train_data(const Mat& data, const Mat& responses, int ntrain_samples)
    {
        Mat sample_idx = Mat::zeros(1, data.rows, CV_8U);
        Mat train_samples = sample_idx.colRange(0, ntrain_samples);
        train_samples.setTo(Scalar::all(1));

        int nvars = data.cols;
        Mat var_type(nvars + 1, 1, CV_8U);
        var_type.setTo(Scalar::all(VAR_ORDERED));
        var_type.at<uchar>(nvars) = VAR_CATEGORICAL;   // the response is categorical

        return TrainData::create(data, ROW_SAMPLE, responses,
            noArray(), sample_idx, noArray(), var_type);
    }

    // Build the iteration termination criteria
    inline TermCriteria TC(int iters, double eps)
    {
        return TermCriteria(TermCriteria::MAX_ITER + (eps > 0 ? TermCriteria::EPS : 0), iters, eps);
    }

    // Predict with the trained classifier and report the train/test hit rates
    void test_and_save_classifier(const Ptr<StatModel>& model, const Mat& data, const Mat& responses,
        int ntrain_samples, int rdelta)
    {
        int i, nsamples_all = data.rows;
        double train_hr = 0, test_hr = 0;

        // compute prediction error on train and test data
        for (i = 0; i < nsamples_all; i++)
        {
            Mat sample = data.row(i);

            float r = model->predict(sample);
            r = std::abs(r + rdelta - responses.at<int>(i)) <= FLT_EPSILON ? 1.f : 0.f;

            if (i < ntrain_samples)
                train_hr += r;
            else
                test_hr += r;
        }

        test_hr /= nsamples_all - ntrain_samples;
        train_hr = ntrain_samples > 0 ? train_hr / ntrain_samples : 1.;

        printf("Recognition rate: train = %.1f%%, test = %.1f%%\n",
            train_hr*100., test_hr*100.);
    }

    // Random trees (random forest) classification
    bool build_rtrees_classifier(const string& data_filename)
    {
        Mat data;
        Mat responses;
        read_num_class_data(data_filename, 16, &data, &responses);

        int nsamples_all = data.rows;
        int ntrain_samples = (int)(nsamples_all*0.8);

        Ptr<RTrees> model;
        Ptr<TrainData> tdata = prepare_train_data(data, responses, ntrain_samples);
        model = RTrees::create();
        model->setMaxDepth(10);
        model->setMinSampleCount(10);
        model->setRegressionAccuracy(0);
        model->setUseSurrogates(false);
        model->setMaxCategories(15);
        model->setPriors(Mat());
        model->setCalculateVarImportance(true);
        model->setActiveVarCount(4);
        model->setTermCriteria(TC(100, 0.01f));
        model->train(tdata);
        test_and_save_classifier(model, data, responses, ntrain_samples, 0);
        cout << "Number of trees: " << model->getRoots().size() << endl;

        // Print variable importance
        Mat var_importance = model->getVarImportance();
        if (!var_importance.empty())
        {
            double rt_imp_sum = sum(var_importance)[0];
            printf("var#\timportance (in %%):\n");
            int i, n = (int)var_importance.total();
            for (i = 0; i < n; i++)
                printf("%-2d\t%-4.1f\n", i, 100.f*var_importance.at<float>(i) / rt_imp_sum);
        }

        return true;
    }

    // AdaBoost classification
    bool build_boost_classifier(const string& data_filename)
    {
        const int class_count = 26;
        Mat data;
        Mat responses;
        Mat weak_responses;

        read_num_class_data(data_filename, 16, &data, &responses);
        int i, j, k;
        Ptr<Boost> model;

        int nsamples_all = data.rows;
        int ntrain_samples = (int)(nsamples_all*0.5);
        int var_count = data.cols;

        // "Unroll" the 26-class problem into a binary one: each sample is copied
        // once per candidate class, the class index is appended as an extra
        // feature, and the response says whether that candidate is correct
        Mat new_data(ntrain_samples*class_count, var_count + 1, CV_32F);
        Mat new_responses(ntrain_samples*class_count, 1, CV_32S);

        for (i = 0; i < ntrain_samples; i++)
        {
            const float* data_row = data.ptr<float>(i);
            for (j = 0; j < class_count; j++)
            {
                float* new_data_row = (float*)new_data.ptr<float>(i*class_count + j);
                memcpy(new_data_row, data_row, var_count * sizeof(data_row[0]));
                new_data_row[var_count] = (float)j;
                new_responses.at<int>(i*class_count + j) = responses.at<int>(i) == j + 'A';
            }
        }

        Mat var_type(1, var_count + 2, CV_8U);
        var_type.setTo(Scalar::all(VAR_ORDERED));
        var_type.at<uchar>(var_count) = var_type.at<uchar>(var_count + 1) = VAR_CATEGORICAL;

        Ptr<TrainData> tdata = TrainData::create(new_data, ROW_SAMPLE, new_responses,
            noArray(), noArray(), noArray(), var_type);
        vector<double> priors(2);
        priors[0] = 1;    // negative class
        priors[1] = 26;   // upweight the rare positive class (1 positive per 25 negatives)

        model = Boost::create();
        model->setBoostType(Boost::GENTLE);
        model->setWeakCount(100);
        model->setWeightTrimRate(0.95);
        model->setMaxDepth(5);
        model->setUseSurrogates(false);
        model->setPriors(Mat(priors));
        model->train(tdata);
        Mat temp_sample(1, var_count + 1, CV_32F);
        float* tptr = temp_sample.ptr<float>();

        // compute prediction error on train and test data
        double train_hr = 0, test_hr = 0;
        for (i = 0; i < nsamples_all; i++)
        {
            int best_class = 0;
            double max_sum = -DBL_MAX;
            const float* ptr = data.ptr<float>(i);
            for (k = 0; k < var_count; k++)
                tptr[k] = ptr[k];

            // score every candidate class and keep the one with the highest raw output
            for (j = 0; j < class_count; j++)
            {
                tptr[var_count] = (float)j;
                float s = model->predict(temp_sample, noArray(), StatModel::RAW_OUTPUT);
                if (max_sum < s)
                {
                    max_sum = s;
                    best_class = j + 'A';
                }
            }

            double r = std::abs(best_class - responses.at<int>(i)) < FLT_EPSILON ? 1 : 0;
            if (i < ntrain_samples)
                train_hr += r;
            else
                test_hr += r;
        }

        test_hr /= nsamples_all - ntrain_samples;
        train_hr = ntrain_samples > 0 ? train_hr / ntrain_samples : 1.;
        printf("Recognition rate: train = %.1f%%, test = %.1f%%\n",
            train_hr*100., test_hr*100.);

        cout << "Number of trees: " << model->getRoots().size() << endl;
        return true;
    }

    // Multi-layer perceptron (ANN) classification
    bool build_mlp_classifier(const string& data_filename)
    {
        const int class_count = 26;
        Mat data;
        Mat responses;

        read_num_class_data(data_filename, 16, &data, &responses);
        Ptr<ANN_MLP> model;

        int nsamples_all = data.rows;
        int ntrain_samples = (int)(nsamples_all*0.8);
        Mat train_data = data.rowRange(0, ntrain_samples);
        Mat train_responses = Mat::zeros(ntrain_samples, class_count, CV_32F);

        // 1. unroll the responses into one-hot rows
        cout << "Unrolling the responses...\n";
        for (int i = 0; i < ntrain_samples; i++)
        {
            int cls_label = responses.at<int>(i) - 'A';
            train_responses.at<float>(i, cls_label) = 1.f;
        }

        // 2. train classifier
        int layer_sz[] = { data.cols, 100, 100, class_count };
        int nlayers = (int)(sizeof(layer_sz) / sizeof(layer_sz[0]));
        Mat layer_sizes(1, nlayers, CV_32S, layer_sz);

    #if 1
        int method = ANN_MLP::BACKPROP;
        double method_param = 0.001;
        int max_iter = 300;
    #else
        int method = ANN_MLP::RPROP;
        double method_param = 0.1;
        int max_iter = 1000;
    #endif

        Ptr<TrainData> tdata = TrainData::create(train_data, ROW_SAMPLE, train_responses);
        model = ANN_MLP::create();
        model->setLayerSizes(layer_sizes);
        model->setActivationFunction(ANN_MLP::SIGMOID_SYM, 0, 0);
        model->setTermCriteria(TC(max_iter, 0));
        model->setTrainMethod(method, method_param);
        model->train(tdata);
        return true;
    }
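
    // The MLP above is only trained, never scored. A minimal evaluation sketch
    // (one sample per row; the arg-max output unit gives the predicted letter):
    //
    //     Mat output;
    //     model->predict(data.row(i), output);   // 1 x class_count scores
    //     Point max_loc;
    //     minMaxLoc(output, 0, 0, 0, &max_loc);
    //     int predicted = 'A' + max_loc.x;       // compare with responses.at<int>(i)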

    // K-nearest-neighbour classification
    bool build_knearest_classifier(const string& data_filename, int K)
    {
        Mat data;
        Mat responses;
        read_num_class_data(data_filename, 16, &data, &responses);
        int nsamples_all = data.rows;
        int ntrain_samples = (int)(nsamples_all*0.8);

        Ptr<TrainData> tdata = prepare_train_data(data, responses, ntrain_samples);
        Ptr<KNearest> model = KNearest::create();
        model->setDefaultK(K);
        model->setIsClassifier(true);
        model->train(tdata);

        test_and_save_classifier(model, data, responses, ntrain_samples, 0);
        return true;
    }
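
    // predict() inside test_and_save_classifier uses the default K set above; to
    // inspect neighbours and distances explicitly, a sketch with findNearest:
    //
    //     Mat results, neighbours, dists;
    //     model->findNearest(sample, K, results, neighbours, dists);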

    // Normal Bayes classification
    bool build_nbayes_classifier(const string& data_filename)
    {
        Mat data;
        Mat responses;
        read_num_class_data(data_filename, 16, &data, &responses);

        int nsamples_all = data.rows;
        int ntrain_samples = (int)(nsamples_all*0.8);

        Ptr<NormalBayesClassifier> model;
        Ptr<TrainData> tdata = prepare_train_data(data, responses, ntrain_samples);
        model = NormalBayesClassifier::create();
        model->train(tdata);

        test_and_save_classifier(model, data, responses, ntrain_samples, 0);
        return true;
    }

    // SVM classification
    bool build_svm_classifier(const string& data_filename)
    {
        Mat data;
        Mat responses;
        read_num_class_data(data_filename, 16, &data, &responses);

        int nsamples_all = data.rows;
        int ntrain_samples = (int)(nsamples_all*0.8);

        Ptr<SVM> model;
        Ptr<TrainData> tdata = prepare_train_data(data, responses, ntrain_samples);
        model = SVM::create();
        model->setType(SVM::C_SVC);
        model->setKernel(SVM::LINEAR);
        model->setC(1);
        model->train(tdata);

        test_and_save_classifier(model, data, responses, ntrain_samples, 0);
        return true;
    }
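
    // C = 1 with a linear kernel is a fixed, untuned choice; OpenCV3's SVM can
    // also cross-validate its own parameters, e.g. (default 10-fold grid search):
    //
    //     model->trainAuto(tdata);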

    int main()
    {
        // letter data shipped with the OpenCV samples
        string data_filename = "D:\\Program Files\\opencv\\sources\\samples\\data\\letter-recognition.data";

        cout << "SVM classification:" << endl;
        build_svm_classifier(data_filename);

        cout << "Normal Bayes classification:" << endl;
        build_nbayes_classifier(data_filename);

        cout << "K-nearest-neighbour classification:" << endl;
        build_knearest_classifier(data_filename, 10);

        cout << "Random trees classification:" << endl;
        build_rtrees_classifier(data_filename);

        cout << "AdaBoost classification:" << endl;
        build_boost_classifier(data_filename);

        cout << "ANN (multi-layer perceptron) classification:" << endl;
        build_mlp_classifier(data_filename);

        system("pause");   // Windows-only; keeps the console window open
        return 0;
    }
    Original post: https://www.cnblogs.com/ggYYa/p/6952514.html