• daal4py 随机森林模型训练mnist并保存模型给C++ daal predict使用


    # daal4py Decision Forest Classification Training example Serialization
    
    import daal4py as d4p
    import numpy as np
    import pickle
    from sklearn.datasets import fetch_mldata
    from sklearn.model_selection import train_test_split
    
    def get_mnist(random_state=0):
        """Load MNIST and return the training split as daal4py-ready arrays.

        The data set is fetched with ``fetch_mldata('MNIST original')`` and
        split into 60000 training and 10000 test rows; only the training part
        is returned.

        Args:
            random_state: Seed forwarded to ``train_test_split``. The split is
                shuffled, so a script that needs the complementary test rows
                must use the same seed; otherwise its "test" rows overlap the
                rows used for training here. Pass ``None`` to get a different
                random split on every call.

        Returns:
            A ``(data, labels)`` tuple of C-contiguous ``float32`` arrays;
            ``labels`` is reshaped to a single column.
        """
        mnist = fetch_mldata('MNIST original')
        X_train, _, y_train, _ = train_test_split(
            mnist.data, mnist.target,
            train_size=60000, test_size=10000,
            random_state=random_state)

        data = np.ascontiguousarray(X_train, dtype=np.float32)
        # daal4py is given the labels as an (n, 1) column rather than a flat vector
        labels = np.ascontiguousarray(y_train, dtype=np.float32).reshape(-1, 1)

        return data, labels
    
    # serialized model can be used only by daal4py with pickle
    def pickle_serialization(result, file='df_result.pkl'):
        """Pickle ``result`` into the file at path ``file``.

        The file is opened in binary write mode, so existing content is
        replaced.
        """
        with open(file, 'wb') as sink:
            sink.write(pickle.dumps(result))
    
    # universal native DAAL model serialization. Can be used in all DAAL interfaces: C++/Java/pydaal/daal4py
    def native_serialization(result, file='native_result.txt'):
        """Write the state buffer of ``result`` to ``file``.

        Args:
            result: Object whose ``__getstate__()`` returns the bytes to store.
            file: Destination path; opened in binary write mode, so existing
                content is replaced.
        """
        daal_buff = result.__getstate__()
        # The context manager closes (and flushes) the file even if the write fails.
        with open(file, "wb") as out:
            out.write(daal_buff)
    
    
    if __name__ == "__main__":
        train_data, train_labels = get_mnist()

        # 'fptype' should match the dtype of the input numpy arrays to achieve
        # the best performance (no data conversion is needed in that case).
        rng_engine = d4p.engines_mt19937(seed=777)
        forest_trainer = d4p.decision_forest_classification_training(
            10,
            fptype='float',
            nTrees=100,
            minObservationsInLeafNode=1,
            engine=rng_engine,
            bootstrap=True,
        )
        training_result = forest_trainer.compute(train_data, train_labels)

        # Store the training result in both formats: pickle and the raw state buffer.
        pickle_serialization(training_result)
        native_serialization(training_result)
    

      

    python预测

    import daal4py as d4p
    
    import numpy as np
    import pickle
    from sklearn.datasets import fetch_mldata
    from sklearn.model_selection import train_test_split
    
    def get_mnist_test(random_state=0):
        """Load MNIST and return the test split as daal4py-ready arrays.

        The data set is fetched with ``fetch_mldata('MNIST original')`` and
        split into 60000 training and 10000 test rows; only the test part is
        returned.

        Args:
            random_state: Seed forwarded to ``train_test_split``. The split is
                shuffled, so this must equal the seed used when the model was
                trained; otherwise the returned rows overlap the training rows
                and the measured accuracy is inflated. Pass ``None`` to get a
                different random split on every call.

        Returns:
            A ``(pdata, plabels)`` tuple of C-contiguous ``float32`` arrays;
            ``plabels`` is reshaped to a single column.
        """
        mnist = fetch_mldata('MNIST original')
        _, X_test, _, y_test = train_test_split(
            mnist.data, mnist.target,
            train_size=60000, test_size=10000,
            random_state=random_state)

        pdata = np.ascontiguousarray(X_test, dtype=np.float32)
        # Labels are kept as an (n, 1) column
        plabels = np.ascontiguousarray(y_test, dtype=np.float32).reshape(-1, 1)

        return pdata, plabels
    
    def checkAccuracy(plabels, prediction):
        """Return the fraction of entries in ``plabels`` matched by ``prediction``.

        Entries are compared position by position; ``prediction`` is indexed
        with the position of each label, so it must be at least as long as
        ``plabels``.
        """
        seen = 0
        mismatches = 0
        for position, expected in enumerate(plabels):
            seen = position + 1
            if expected != prediction[position]:
                mismatches += 1
        return 1 - mismatches / seen
    
    def pickle_deserialization(file='df_result.pkl'):
        """Load and return the object pickled in the file at path ``file``."""
        # NOTE: unpickling can execute arbitrary code; only load files you trust.
        with open(file, 'rb') as source:
            restored = pickle.load(source)
        return restored
    
    def native_deserialization(file='native_result.txt'):
        """Rebuild a decision forest training result from a state buffer on disk.

        Args:
            file: Path of the file holding the bytes to restore; read in
                binary mode.

        Returns:
            A ``d4p.decision_forest_classification_training_result`` whose
            state was set from the file content via ``__setstate__``.
        """
        daal_result = d4p.decision_forest_classification_training_result()
        # The context manager closes the file even if reading fails.
        with open(file, "rb") as inp:
            daal_buff = inp.read()
        daal_result.__setstate__(daal_buff)
        return daal_result
    
    if __name__ == "__main__":
        nClasses = 10

        pdata, plabels = get_mnist_test()

        # Deserialize the model from both stored formats
        deserialized_result_pickle = pickle_deserialization()
        deserialized_result_native = native_deserialization()

        # Predict using the deserialized models; fptype is 'float' to match the
        # float32 input data.
        predict_algo = d4p.decision_forest_classification_prediction(nClasses, fptype='float')

        # Prediction with the pickle-obtained model
        predict_result = predict_algo.compute(pdata, deserialized_result_pickle.model)
        print("\nAccuracy:", checkAccuracy(plabels, predict_result.prediction))

        # Prediction with the native-obtained model; expected to give the same
        # result as above.
        predict_result = predict_algo.compute(pdata, deserialized_result_native.model)
        print("\nAccuracy:", checkAccuracy(plabels, predict_result.prediction))
    

    c++使用该daal4py的模型:  

    /**
     * <a name="DAAL-EXAMPLE-CPP-DF_CLS_DENSE_BATCH"></a>
     * example df_cls_dense_batch.cpp
     */
    
    #include "daal.h"
    #include "service.h"
    #include "stdio.h"
    using namespace std;
    using namespace daal;
    using namespace daal::algorithms;
    using namespace daal::algorithms::decision_forest::classification;
    
    /* Input data set parameters */
    /* Default CSV paths; main() passes the addresses of both strings to checkArguments(). */
    /* NOTE(review): both strings are const, yet their addresses are handed to
       checkArguments() -- confirm that helper never writes through them. */
    const string testDatasetFileName  = "../data/batch/mnist_test_data.csv";
    const string labels  = "../data/batch/mnist_test_labels.csv";
    
    const size_t nFeatures  = 784;  /* Number of features in training and testing data sets */
    const size_t nClasses = 10;  /* Number of classes */
    
    /* NOTE(review): testModel() is declared here but no definition or call is visible in this listing. */
    void testModel();
    /* Loads the feature CSV and the label CSV into pData and pDependentVar respectively. */
    void loadData(const std::string& dataFileName, const std::string& labelsFileName, NumericTablePtr& pData, NumericTablePtr& pDependentVar);
    /* Prints the fraction of rows where prediction equals testGroundTruth. */
    void check_accuracy(NumericTablePtr prediction, NumericTablePtr testGroundTruth);
    
    int main(int argc, char *argv[])
    {
        checkArguments(argc, argv, 2, &labels, &testDatasetFileName);
    
        /* Deserialization */
        size_t size = 0;
        byte * buffer = NULL;
        FILE * pFile;
        size_t result;
        
        pFile = fopen ( "../data/batch/native_result.txt" , "rb" );
        if (pFile==NULL)
        {
            fputs ("File error",stderr);
            exit (1);
        }
        
        // obtain file size:
        fseek (pFile , 0 , SEEK_END);
        size = ftell (pFile);
        std::cout << "size: " << size << "
    ";
        rewind(pFile);
        
        // allocate memory to contain the whole file:
        buffer = (byte*) malloc (sizeof(byte)*size);
        if (buffer == NULL)
        {
            fputs ("Memory error",stderr); 
            exit (2);
        }
        
        // copy the file into the buffer:
        result = fread (buffer,1,size,pFile);
        if (result != size)
        {
            fputs ("Reading error",stderr);
            exit (3);
        }
        /* the result buffer is now loaded in the buffer. */
    
        /* Create a data archive to deserialize the numeric table */
        OutputDataArchive out_dataArch(buffer, size);
        free (buffer);
        fclose (pFile);
    
        /* needed for result allocation */
        training::Batch<> train(nClasses);
        train.getResult()->deserialize(out_dataArch);
    
        /* Create Numeric Tables for testing data and ground truth values */
        NumericTablePtr testData;
        NumericTablePtr testGroundTruth;
    
        loadData(testDatasetFileName, labels, testData, testGroundTruth);
        /* Create an algorithm object to predict values of decision forest classification */
        prediction::Batch<> algorithm(nClasses);
    
        /* Pass a testing data set and the trained model to the algorithm */
        algorithm.input.set(classifier::prediction::data, testData);
        /* set deserialized model */
        algorithm.input.set(classifier::prediction::model, train.getResult()->get(classifier::training::model));
    
        /* Predict values of decision forest classification */
        algorithm.compute();
    
        /* Retrieve the algorithm results */
        NumericTablePtr prediction = algorithm.getResult()->get(classifier::prediction::prediction); 
        printNumericTable(prediction, "Prediction results (first 10 rows):", 10);
        printNumericTable(testGroundTruth, "Ground truth (first 10 rows):", 10);
    
        check_accuracy(prediction, testGroundTruth);
        
        return 0;
    }
    
    /*
     * Prints "accuracy: <value>", where <value> is the fraction of rows whose
     * first value in `prediction` equals the one in `testGroundTruth`.
     * Both tables are read as blocks of doubles and indexed by row number,
     * i.e. they are treated as single-column tables.
     * If the row counts differ, an error is reported on stderr and nothing is
     * computed, because indexing would run past the shorter block.
     */
    void check_accuracy(NumericTablePtr prediction, NumericTablePtr testGroundTruth)
    {
        const size_t nRows = testGroundTruth->getNumberOfRows();
        if (prediction->getNumberOfRows() != nRows)
        {
            fputs("Prediction and ground truth row counts differ", stderr);
            return;
        }

        BlockDescriptor<double> blockPr;
        prediction->getBlockOfRows(0, nRows, readOnly, blockPr);
        const double * valueP = blockPr.getBlockPtr();

        BlockDescriptor<double> blockGT;
        testGroundTruth->getBlockOfRows(0, nRows, readOnly, blockGT);
        const double * valueG = blockGT.getBlockPtr();

        /* Count the rows where the predicted label differs from the true one */
        size_t count = 0;
        for (size_t i = 0; i < nRows; i++)
        {
            if (valueG[i] != valueP[i])
                count++;
        }

        testGroundTruth->releaseBlockOfRows(blockGT);
        prediction->releaseBlockOfRows(blockPr);

        cout << "accuracy: " << 1 - double(count) / double(nRows) << "\n";
    }
    
    void loadData(const std::string& dataFileName,const std::string& labelsFileName, NumericTablePtr& pData, NumericTablePtr& pDependentVar)
    {
        /* Initialize FileDataSource<CSVFeatureManager> to retrieve the input data from a .csv file */
        FileDataSource<CSVFeatureManager> trainDataSource(dataFileName,
            DataSource::notAllocateNumericTable,
            DataSource::doDictionaryFromContext);
    
        FileDataSource<CSVFeatureManager> trainLabels(labelsFileName,
            DataSource::notAllocateNumericTable,
            DataSource::doDictionaryFromContext);
    
        /* Create Numeric Tables for training data and dependent variables */
        pData.reset(new HomogenNumericTable<>(nFeatures, 0, NumericTable::notAllocate));
        pDependentVar.reset(new HomogenNumericTable<>(1, 0, NumericTable::notAllocate));
    
        /* Retrieve the data from input file */
        trainDataSource.loadDataBlock(pData.get());
        trainLabels.loadDataBlock(pDependentVar.get());
        NumericTableDictionaryPtr pDictionary = pData->getDictionarySharedPtr();
    }
    

      

  • 相关阅读:
    array.prototype.slice.call(arguments)
    axios 的坑
    Cannot read property 'range' of null
    IDEA导入Eclipse的JavaEE项目详细步骤链接
    使用 yarn 安装时,报错node_modules ode sass:Command failed.
    axios post请求非json传参设置
    vue-cli 3.x跨域配置
    idea错误: 找不到或无法加载主类
    Git 上传新项目
    Windows下生成SSH密钥
  • 原文地址:https://www.cnblogs.com/bonelee/p/10044911.html
Copyright © 2020-2023  润新知