Intel daal数据预处理

https://software.intel.com/en-us/daal-programming-guide-datasource-featureextraction-py

# file: datasource_featureextraction.py
#===============================================================================
# Copyright 2014-2018 Intel Corporation.
#
# This software and the related documents are Intel copyrighted  materials,  and
# your use of  them is  governed by the  express license  under which  they were
# provided to you (License).  Unless the License provides otherwise, you may not
# use, modify, copy, publish, distribute,  disclose or transmit this software or
# the related documents without Intel's prior written permission.
#
# This software and the related documents  are provided as  is,  with no express
# or implied  warranties,  other  than those  that are  expressly stated  in the
# License.
#===============================================================================

#
# !  Content:
# !    Python example for using of data source feature extraction
# !*****************************************************************************

#
## <a name = "DAAL-EXAMPLE-PY-DATASOURCE_FEATUREEXTRACTION"></a>
## example datasource_featureextraction.py
#
import os
import sys

from daal.data_management import FileDataSource, DataSourceIface, ColumnFilter, OneHotEncoder

utils_folder = os.path.realpath(os.path.abspath(os.path.dirname(os.path.dirname(__file__))))
if utils_folder not in sys.path:
    sys.path.insert(0, utils_folder)
from utils import printNumericTable


# Input data set parameters: relative path of the .csv file that is loaded below
datasetFileName = "../data/batch/kmeans_dense.csv"

if __name__ == "__main__":

    # Initialize FileDataSource to retrieve the input data from a .csv file;
    # doAllocateNumericTable asks the data source to allocate its numeric table itself
    dataSource = FileDataSource(datasetFileName, DataSourceIface.doAllocateNumericTable)

    # Create the data dictionary associated with the data source from the .csv content
    dataSource.createDictionaryFromContext()

    # Column indices handed to the column filter (counted from the leftmost column, starting at 0)
    validList = [1, 2, 5]

    # Build the column filter and register it as a modifier on the feature manager
    colFilter = ColumnFilter()
    filterList = colFilter.list(validList)
    dataSource.getFeatureManager().addModifier(filterList)

    # Consider column with index 1 as categorical and convert it into 3 binary categorical features
    # NOTE(review): the second argument (3) is presumably the number of categories, i.e. the
    # number of one-hot output columns -- confirm against the DAAL reference.
    dataSource.getFeatureManager().addModifier(OneHotEncoder(1, 3))

    # Load data from .csv file; the registered modifiers are applied while loading
    dataSource.loadDataBlock()

    # Print result: message "Loaded data", num_printed_rows=4, num_printed_cols=20
    table = dataSource.getNumericTable()
    printNumericTable(table, "Loaded data", 4, 20)

csv里的3行示例数据（***为期望获取的数据）：

-21.535651,14.132297,***31.235426***,99.000715,63.627557,***79.873729***,-72.628829,17.935287,12.955671,56.199663,-29.835004,-87.386819,-111.845621,72.467268,-46.623092,-52.977263,-6.172873,97.726714,-112.612707,-59.935622 
2.302152,-29.002080,***73.105128***,28.452499,57.450846,***87.025368***,-63.366150,13.260953,-34.590126,-12.709892,44.982761,8.761890,-23.771476,15.061654,74.016723,-67.233482,-15.883441,-56.213821,-29.666733,87.122775 
-58.761132,5.850924,***56.845222***,77.251095,101.058664,***-22.501586***,-46.134525,-83.836106,-64.732183,-92.188620,4.275683,35.571124,47.117437,35.720971,-102.055371,41.777812,32.327431,-90.137946,-62.958812,93.033859

输出结果：

1.000     0.000     0.000     31.235    79.874    
0.000     1.000     0.000     73.105    87.025    
0.000     0.000     1.000     56.845    -22.502   
0.000     0.000     0.000     33.922    79.370

OneHotEncoder这里没有懂啊！！！

printNumericTable的实现代码：printNumericTable(data_table, message='', num_printed_rows=0, num_printed_cols=0, interval=10)

def printNumericTable(data_table, message='', num_printed_rows=0, num_printed_cols=0,
                      interval=10):
    """Print the contents of a numeric table.

    Full and CSR layouts are read through a block descriptor and printed with
    ``printArray``; other layouts are read with ``getBlockOfRowsAsDouble`` and
    printed with ``printLowerArray`` or ``printUpperArray``. A layout that is
    neither full/CSR nor lower nor upper prints nothing.

    Args:
        data_table: Table object providing ``getNumberOfRows``,
            ``getNumberOfColumns``, ``getDataLayout`` and the block accessors
            used below.
        message: Text forwarded unchanged to the print helper.
        num_printed_rows: Maximum number of rows to print; 0 means all rows.
        num_printed_cols: Maximum number of columns to print; 0 means all
            columns.
        interval: Forwarded unchanged to the print helper (presumably the
            column width -- confirm against the helper's definition).
    """
    num_rows = data_table.getNumberOfRows()
    num_cols = data_table.getNumberOfColumns()
    layout = data_table.getDataLayout()

    # A limit of 0 means "print everything"; otherwise clamp to the table size.
    if num_printed_rows != 0:
        num_printed_rows = min(num_rows, num_printed_rows)
    else:
        num_printed_rows = num_rows

    if num_printed_cols != 0:
        num_printed_cols = min(num_cols, num_printed_cols)
    else:
        num_printed_cols = num_cols

    block = BlockDescriptor()
    if isFull(layout) or layout == NumericTableIface.csrArray:
        data_table.getBlockOfRows(0, num_rows, readOnly, block)
        try:
            printArray(block.getArray(), num_printed_cols, num_printed_rows,
                       num_cols, message, interval)
        finally:
            # Release the block even when printing fails, so the table is not
            # left with an outstanding block of rows.
            data_table.releaseBlockOfRows(block)
    else:
        packed_table = data_table.getBlockOfRowsAsDouble(0, num_rows)

        if isLower(layout):
            printLowerArray(packed_table, num_printed_rows, message, interval)
        elif isUpper(layout):
            printUpperArray(packed_table, num_printed_cols, num_printed_rows,
                            num_cols, message, interval)

追踪下getFeatureManager里面有哪些东西。

~/anaconda3/envs/intelpy# ack getFeatureManager .
share/pydaal_examples/examples/python/source/datasource/datasource_featureextraction.py
52:    dataSource.getFeatureManager().addModifier(filterList)
55:    dataSource.getFeatureManager().addModifier(OneHotEncoder(1, 3))

lib/python3.6/site-packages/daal/data_management/db.py
213:    def getFeatureManager(self):
214:        return _db.ODBCDataSource_MySQLFeatureManagerFloat64_getFeatureManager(self)

lib/python3.6/site-packages/daal/data_management/__init__.py
3939:    def getFeatureManager(self):
3940:        return _data_management.CsvDataSource_CSVFeatureManagerFloat64_getFeatureManager(self)

看到getFeatureManager最后调用的是CsvDataSource_CSVFeatureManagerFloat64_getFeatureManager

root@vultr:~/anaconda3/envs/intelpy# vi lib/python3.6/site-packages/daal/data_management/__init__.py

然后继续追踪：
root@vultr:~/anaconda3/envs/intelpy# ack CsvDataSource_CSVFeatureManagerFloat64_getFeatureManager .
lib/python3.6/site-packages/daal/data_management/__init__.py
3940:        return _data_management.CsvDataSource_CSVFeatureManagerFloat64_getFeatureManager(self)

最后发现是在动态链接库里！！！
root@vultr:~/anaconda3/envs/intelpy# grep CsvDataSource_CSVFeatureManagerFloat64_getFeatureManager . -r
Binary file ./lib/python3.6/site-packages/daal/data_management/_data_management.cpython-36m-x86_64-linux-gnu.so matches
Binary file ./lib/python3.6/site-packages/daal/data_management/__pycache__/__init__.cpython-36.pyc matches
./lib/python3.6/site-packages/daal/data_management/__init__.py:        return _data_management.CsvDataSource_CSVFeatureManagerFloat64_getFeatureManager(self)

看来要知道getFeatureManager里的方法，只有使用help和dir了。

可以看到type（dataSource.getFeatureManager()）的结果：

再看看dir的结果：
['__class__', '__del__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__swig_destroy__', '__swig_getmethods__', '__swig_setmethods__', '__weakref__', '_s', 'addModifier', 'finalize', 'getNumericTableNumberOfColumns', 'parseRowAsDictionary', 'parseRowAsHeader', 'parseRowIn', 'setDelimiter', 'setFeatureDetailsFromDictionary', 'this']

help文档结果：

Help on CSVFeatureManager in module daal.data_management object:

class CSVFeatureManager(StringRowFeatureManagerIface)
 |  Method resolution order:
 |      CSVFeatureManager
 |      StringRowFeatureManagerIface
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __del__ lambda self
 |  
 |  __getattr__ lambda self, name
 |  
 |  __init__(self)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __repr__ = _swig_repr(self)
 |  
 |  __setattr__ lambda self, name, value
 |  
 |  __swig_destroy__ = delete_CSVFeatureManager(...)
 |  
 |  addModifier(self, *args)
 |  
 |  finalize(self, dictionary)
 |  
 |  getNumericTableNumberOfColumns(self)
 |  
 |  parseRowAsDictionary(self, rawRowData, dictionary)
 |  
 |  parseRowAsHeader(self, rawRowData)
 |  
 |  parseRowIn(self, rawRowData, dictionary, nt, ntRowIndex)
 |  
 |  setDelimiter(self, delimiter)
 |  
 |  setFeatureDetailsFromDictionary(self, dictionary)
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes defined here:
 |  
 |  __swig_getmethods__ = {}
 |  
 |  __swig_setmethods__ = {}
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from StringRowFeatureManagerIface:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)

不知道咋用这几个函数啊！！！

from：https://software.intel.com/en-us/daal-programming-guide-data-sources

Data Sources

Data sources define interfaces for access and management of data in raw format and out-of-memory data. A data source is closely coupled with the data dictionary that describes the structure of the data associated with the data source. To create the associated data dictionary, you can do one of the following:

While constructing a data source object, specify whether it should automatically create and initialize the associated data dictionary.
Call the createDictionaryFromContext() method.

The getDictionary() method returns the dictionary associated with the data source.

Data sources stream and transform raw out-of-memory data into numeric in-memory data accessible through numeric table interfaces. A data source is associated with the corresponding numeric table. To allocate the associated numeric table, you can do one of the following:

While constructing a data source object, specify whether it should automatically allocate the numeric table.
Call the allocateNumericTable() method.

The getNumericTable() method returns the numeric table associated with the data source.

To retrieve the number of columns (features) in a raw data set, use the getNumberOfColumns() method. To retrieve the number of rows (observations) available in a raw data set, use the getNumberOfAvailableRows() method. The getStatus() method returns the current status of the data source:

readyForLoad - the data is available for the load operation.
waitingForData - the data source is waiting for new data to arrive later; designated for data sources that deal with asynchronous data streaming, that is, the data arriving in blocks at different points in time.
endOfData - all the data is already loaded.

Because the entire out-of-memory data set may fail to fit into memory, as well as for performance reasons, Intel® DAAL implements data loading in blocks. Use the loadDataBlock() method to load the next block of data into the numeric table. This method enables you to load a data block into an internally allocated numeric table or into the provided numeric table. In both cases, you can specify the number of rows or not. The method also recalculates basic statistics associated with this numeric table.

Intel DAAL maintains the list of possible values associated with categorical features to convert them into a numeric form. In this list, a new index is assigned to each new value found in the raw data set. You can get the list of possible values from the possibleValues collection associated with the corresponding feature in the data source. In the case you have several data sets with same data structure and you want to use continuous indexing, do the following:

Retrieve the data dictionary from the last data source using the getDictionary() method.
Assign this dictionary to the next data source using the setDictionary() method.
Repeat these steps for each next data source.

Intel® DAAL data source
Intel DAAL implements classes for some popular types of data sources. Each of these classes takes a feature manager class as the class template parameter. The feature manager parses, filters, and normalizes the data and converts it into a numeric format. The following are the data sources and the corresponding feature manager classes:

Text file (FileDataSource class), to be used with the CSVFeatureManager class
ODBC (ODBCDataSource class), to be used with the MySQLFeatureManager class
In-memory text (StringDataSource class), to be used with the CSVFeatureManager class
KDB relational database (KDBDataSource class) [kdb], to be used with the KDBFeatureManager class

CSVFeatureManager provides additional capabilities for features modification. Use addModifier() to enable specific modification when loading data to a numeric table:

Add the ColumnFilter object if you need to have a predefined subset of features loaded
Add the OneHotEncoder object if you need a categorical feature to be encoded using the one-hot scheme

Feature managers provide additional capabilities for the modification of the input data during its loading. Use the Feature modifier entity to define desired modification. Feature modifiers enable you to implement a wide range of feature extraction or transformation techniques, for instance, feature binarization, one-hot-encoding, or polynomial features generation. To enable specific modification, use the addModifier() method that accepts two parameters:

featureIds - a subset of feature identifiers for which you want to apply modification.
featureModifier - an implementation of the Feature modifier, an object that implements the FeatureModifierIface interface and specifies the way how features of the input data set should be modified and written to the output numeric table.

Typical feature modifiers usage scenario is the following:

Create the data source object and specify a feature manager and its parameters.
Define a subset of features for modification and proper feature modifier.
Add modifier to the feature manager of the data source object.
Call loadDataBlock(), it causes data set loading and applying specified modification procedure to the features of the data set.

The code block below demonstrates feature modifiers usage scenario in case of FileDataSource and CSVFeatureManager.

// Create DataSource object (for example FileDataSource)
FileDataSource<CSVFeatureManager> ds("file.csv", options);

// Specify features subset and modifier:
// "f1" and "f2" are feature (column) names, parsed here as continuous values
auto featureIds = features::list("f1", "f2");
auto featureModifier = modifiers::csv::continuous();

// Add modifier to feature manager (pass the modifier object declared above)
ds.getFeatureManager().addModifier(featureIds, featureModifier);

// Cause data loading; the modifier is applied while the data set is loaded
ds.loadDataBlock();

A feature subset may be defined with the functions list(…) , range(…), all(), or allReverse() located in the namespace data_management::features. For example, you can use numerical or string identifiers to refer to the particular feature in the data set. A string identifier may correspond to a feature name (for instance, name in CSV header or in SQL table column name) and numerical one to the index of a feature. The following code block shows several ways to define a feature subset. f1 , f2, and f4 are the names of the respective columns in CSV file or SQL table, and the numbers 0, 2 - 4 are the indices of columns starting from the left one.

features::list("f1", "f2")   // String identifiers
features::list(0, 3);        // Numerical identifiers
features::list("f1", 2);     // Mixed identifiers
features::range(0, 4);       // Range of features, the same as list(0,…,4)
features::range("f1", "f4"); // Range with string identifiers
features::all();             // Refer to all features in the data set
features::allReverse()       // Like features::all() but in reverse order


// With STL vector
std::vector<features::IdFactory> fv;
fv.push_back("f2"); fv.push_back(3);
features::list(fv);

// With C++ 11 initializer list
features::list({ "f2", 3, "f1" });

We will use the term input features to refer to the columns of raw out-of-memory data and the term output features for the columns of numeric in-memory data. A feature modifier transforms specified input features subset to the output features. The number of output features is determined by the modifier. A feature modifier is expected to read the values corresponding to specified input features from the i-th row and write modified values to the i-th row of the output numeric table. In general case, feature modifier is able to process arbitrary number of input features to arbitrary number of output features. Let's assume that we added $m$ modifiers along with the features subsets $F_1, \ldots, F_m$ and the $j$-th modifier has the $C_j$ output columns, where

$$F_j = \left(f^{j}_{i_1}, \ldots, f^{j}_{i_{n_j}}\right)$$

are specified input features of interest, $f^{j}_{i_k} \in \{f_1, \ldots, f_p\}$, where $\{f_1, \ldots, f_p\}$ are all possible features, $p$ is the number of features in the input data. The output numeric table will contain

$$C_1 + C_2 + \ldots + C_m$$

columns. The $j$-th feature modifier writes result to the columns starting with the index

$$1 + \sum_{i=1}^{j-1} C_i,$$

in particular the first feature modifier writes to the first $C_1$ columns, and the last to the last $C_m$ columns of the output table. The following picture demonstrates the case of two modifiers. Feature Modifier 1 reads the features $f_1, f_3$ from an input data set, performs data transformation and writes the result to the columns 1, 2 in the output numeric table. Feature Modifier 2 behaves similarly, but processes features $f_2, f_5$ and has 3 output features.
Intel® DAAL data source

The Intel® DAAL has several predefined feature modifiers available for CSV and SQL feature managers.

continuous - parses input values as real numbers, the number of output features is equal to the number of input features.
categorical - parses input values as categorical features (described above), the number of output features is equal to the number of input features.
automatic - automatically selects appropriate parsing scheme (continuous or categorical)
oneHotEncoder - apply one-hot-encoding scheme for input features, the number of output features is equal to the sum of unique values for features in the input data set.

NOTE

The present version of library doesn't provide predefined feature modifiers for handling ordinal features.

You can implement your own feature modifier by inheriting from FeatureModifierBase and overriding its methods. An example interface of user-defined feature modifier is shown in the code block below:

/* Example interface of a user-defined CSV feature modifier. The three methods
 * correspond to the three stages of a modifier's lifetime. */
class MyFeatureModifier : public modifiers::csv::FeatureModifierBase 
{
public:
   /* Initialization stage: the configuration of the modifier (for example the
    * number of output features) can be changed through 'config' */
   virtual void initialize(modifiers::csv::Config &config);
   /* Applying loop: called for every row of the input data set; information
    * about the current row is provided via 'context' */
   virtual void apply(modifiers::csv::Context &context);
   /* Finalization stage: called with the same config object that was passed
    * at the initialization stage */
   virtual void finalize(modifiers::csv::Config &config);
};

Use the addModifier(…) method to add the user-defined modifier to the feature manager:

// Register the user-defined modifier for the input features with numerical identifiers 0 and 3
ds.getFeatureManager().addModifier( 
   features::list(0, 3), modifiers::custom<MyFeatureModifier>()
);

Feature modifier's lifetime consists of three stages:

Initialization. Feature manager performs modifier initialization by calling the initialize method. The Config class provides methods to change configuration of the modifier. For example, use the Config::setNumberOfOutputFeatures(…) to adjust numbers of output features produced by the modifier. By default, the number of output features is equal to the number of input features.
Applying loop. Feature manager calls the apply method for every row in the input data set, information about the current row is provided via context object. To implement this method, you need to get the input data from the context, carry out desired transformation and write result back to the context output buffer. You can get the output buffer by calling the Context::getOutputBuffer() method, the buffer's size must be equal to the number of output features you specified at the initialization stage.
Finalization. Finalization happens when feature manager calls the finalize method with the same config object passed at the initialization stage. For example, you may use this method to release intermediate buffers when the data transformation is done.

Note that exact set of methods available for Config and Context depends on the data source type. Please refer to Developer Reference to get detailed information about supported methods.

Examples

C++:

datasource_mysql.cpp
datasource_kdb.cpp
simple_csv_feature_modifiers.cpp
custom_csv_feature_modifiers.cpp

看一个定制modifier的例子：

/* file: custom_csv_feature_modifiers.cpp */
/*******************************************************************************
* Copyright 2014-2018 Intel Corporation.
*
* This software and the related documents are Intel copyrighted  materials,  and
* your use of  them is  governed by the  express license  under which  they were
* provided to you (License).  Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute,  disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents  are provided as  is,  with no express
* or implied  warranties,  other  than those  that are  expressly stated  in the
* License.
*******************************************************************************/

/*
!  Content:
!    C++ example of modifiers usage with file data source
!******************************************************************************/

/**
 * <a name="DAAL-EXAMPLE-CPP-DATASOURCE_CUSTOM_CSV_FEATURE_MODIFIERS">
 * example custom_csv_feature_modifiers.cpp
 */

#include <cassert>
#include <algorithm>

#include "daal.h"
#include "service.h"

using namespace daal::data_management;

/** Custom CSV feature modifier: each output feature is the square of the matching input token */
class MySquaringModifier : public modifiers::csv::FeatureModifier
{
public:
    /* Invoked for every row of the CSV file */
    virtual void apply(modifiers::csv::Context &context)
    {
        const size_t tokenCount = context.getNumberOfTokens();
        daal::services::BufferView<DAAL_DATA_TYPE> squares = context.getOutputBuffer();

        /* A token is one comma-separated word of the row. Unless the modifier
         * calls 'setNumberOfOutputFeatures' at its initialization stage
         * (see 'MyMaxFeatureModifier'), the output buffer holds exactly one
         * slot per token */
        assert(tokenCount == squares.size());

        size_t idx = 0;
        while (idx < tokenCount)
        {
            /* Parse the token as float and store its square in the same position */
            const float value = context.getTokenAs<float>(idx);
            squares[idx] = value * value;
            ++idx;
        }
    }
};

/** Custom CSV feature modifier: reduces all selected input features of a row to their maximum */
class MyMaxFeatureModifier : public modifiers::csv::FeatureModifier
{
public:
    /* Invoked once, before the CSV file is parsed */
    virtual void initialize(modifiers::csv::Config &config)
    {
        /* The modifier computes y = max { x_1, ..., x_n } over the input
         * features x_i, so it produces exactly one output feature y */
        config.setNumberOfOutputFeatures(1);
    }

    /* Invoked for every row of the CSV file */
    virtual void apply(modifiers::csv::Context &context)
    {
        const size_t tokenCount = context.getNumberOfTokens();

        /* Start from the first token and keep the largest value seen so far,
         * parsing every token as float */
        float largest = context.getTokenAs<float>(0);
        for (size_t idx = 1; idx < tokenCount; ++idx)
        {
            const float candidate = context.getTokenAs<float>(idx);
            if (largest < candidate)
            {
                largest = candidate;
            }
        }

        /* The output buffer has as many slots as output features were
         * requested in 'initialize', i.e. a single one */
        context.getOutputBuffer()[0] = largest;
    }
};

int main(int argc, char *argv[])
{
    /* Path to the CSV to be read */
    const std::string csvFileName = "../data/batch/mixed_text_and_numbers.csv";

    checkArguments(argc, argv, 1, &csvFileName);

    /* Define options for CSV data source */
    const CsvDataSourceOptions csvOptions = CsvDataSourceOptions::allocateNumericTable |
                                            CsvDataSourceOptions::createDictionaryFromContext |
                                            CsvDataSourceOptions::parseHeader;

    /* Define CSV file data source */
    FileDataSource<CSVFeatureManager> ds(csvFileName, csvOptions);

    /* Configure format of output numeric table by applying modifiers.
     * Output numeric table will have the following format:
     * | Numeric1 | Numeric2 ^ 2 | Numeric5 ^ 2 | max(Numeric0, Numeric5) | */
    ds.getFeatureManager()
        .addModifier( features::list("Numeric1"), modifiers::csv::continuous() )
        .addModifier( features::list("Numeric2", "Numeric5"), modifiers::csv::custom<MySquaringModifier>() )
        .addModifier( features::list("Numeric0", "Numeric5"), modifiers::csv::custom<MyMaxFeatureModifier>() );

    /* Load and parse CSV file */
    ds.loadDataBlock();

    printNumericTable(ds.getNumericTable(), "Loaded numeric table:");

    return 0;
}

Parent topic: Data Management

相关阅读:
已知：每个飞机只有一个油箱，飞机之间可以相互加油（注意是相互，没有加油机）一箱油可供一架飞机绕地球飞半圈，问题：为使至少一架飞机绕地球一圈回到起飞时的飞机
 简易vector的实现
 简单的内存池实现
 归并排序，递归与非递归
 堆排序
 位运算
 二叉树的建立，以及非递归遍历
 “云端融合”思想的自我摸索（很不靠谱）
linux android开发环境搭建
 Android系统架构及内核简介
原文地址：https://www.cnblogs.com/bonelee/p/9913774.html