daal4py 安装(记得先安装 Anaconda):
# Fetch the daal4py sources
git clone https://github.com/IntelPython/daal4py.git
cd daal4py

# Create and activate a dedicated conda environment with the build dependencies
conda create -n DAAL4PY -c intel -c intel/label/test -c conda-forge python=3.6 mpich cnc tbb-devel daal daal-include cython jinja2 numpy
source activate DAAL4PY

# Point the build at the libraries installed into the conda environment
export CNCROOT=$CONDA_PREFIX
export TBBROOT=$CONDA_PREFIX
export DAALROOT=$CONDA_PREFIX

# Build and install daal4py
python setup.py build_ext
python setup.py install

# Run the demos shown below, then leave the environment
source deactivate DAAL4PY
注意:安装过程较慢,耐心等待。
随机森林:
#*******************************************************************************
# Copyright 2014-2018 Intel Corporation
# All Rights Reserved.
#
# This software is licensed under the Apache License, Version 2.0 (the
# "License"), the following terms apply:
#
# You may not use this file except in compliance with the License.  You may
# obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#
# See the License for the specific language governing permissions and
# limitations under the License.
#*******************************************************************************

# daal4py Decision Forest Classification example for shared memory systems

import daal4py as d4p
import numpy as np

# let's try to use pandas' fast csv reader
try:
    import pandas
except ImportError:
    # Only a missing pandas triggers the fallback; any other error propagates.
    def read_csv(f, c):
        """Load columns *c* of the headerless CSV file *f* as a 2-D float32 array (numpy loadtxt)."""
        return np.loadtxt(f, usecols=c, delimiter=',', ndmin=2, dtype=np.float32)
else:
    def read_csv(f, c):
        """Load columns *c* of the headerless CSV file *f* as a 2-D float32 array (pandas reader)."""
        return pandas.read_csv(f, usecols=c, delimiter=',', header=None, dtype=np.float32).values


def main():
    """Train a decision forest on the training CSV and predict on the test CSV.

    Returns a tuple ``(train_result, predict_result, plabels)`` where
    ``plabels`` holds the label column read from the test file.
    """
    # input data file
    infile = "./data/batch/df_classification_train.csv"
    testfile = "./data/batch/df_classification_test.csv"

    # Configure a training object (5 classes)
    train_algo = d4p.decision_forest_classification_training(
        5,
        nTrees=10,
        minObservationsInLeafNode=8,
        featuresPerNode=3,
        engine=d4p.engines_mt19937(seed=777),
        varImportance='MDI',
        bootstrap=True,
        resultsToCompute='computeOutOfBagError',
    )

    # Read data. Let's use 3 features per observation
    data = read_csv(infile, range(3))
    labels = read_csv(infile, range(3, 4))
    train_result = train_algo.compute(data, labels)
    # Training result provides (depending on parameters) model, outOfBagError,
    # outOfBagErrorPerObservation and/or variableImportance

    # Now let's do some prediction
    predict_algo = d4p.decision_forest_classification_prediction(5)
    # read test data (with same #features)
    pdata = read_csv(testfile, range(3))
    plabels = read_csv(testfile, range(3, 4))
    # now predict using the model from the training above
    predict_result = predict_algo.compute(pdata, train_result.model)

    # Prediction result provides prediction: one column, one row per test observation
    assert predict_result.prediction.shape == (pdata.shape[0], 1)

    return (train_result, predict_result, plabels)


if __name__ == "__main__":
    (train_result, predict_result, plabels) = main()
    print(" Variable importance results: ", train_result.variableImportance)
    print(" OOB error: ", train_result.outOfBagError)
    print(" Decision forest prediction results (first 10 rows): ", predict_result.prediction[0:10])
    print(" Ground truth (first 10 rows): ", plabels[0:10])
    print('All looks good!')
demo示例数据:
0.00125126,0.563585,8,2, 0.193304,0.808741,12,1, 0.585009,0.479873,6,1, 0.350291,0.895962,13,4, 0.82284,0.746605,11,2, 0.174108,0.858943,12,0, 0.710501,0.513535,10,2, 0.303995,0.0149846,1,2, 0.0914029,0.364452,4,0, 0.147313,0.165899,0,4, 0.988525,0.445692,7,2, 0.119083,0.00466933,0,2, 0.0089114,0.37788,4,2, 0.531663,0.571184,10,3, 0.601764,0.607166,10,4, 0.166234,0.663045,8,4, 0.450789,0.352123,5,3, 0.0570391,0.607685,8,4, 0.783319,0.802606,15,3, 0.519883,0.30195,6,2, 0.875973,0.726676,11,1, 0.955901,0.925718,15,3, 0.539354,0.142338,2,3, 0.462081,0.235328,1,2, 0.862239,0.209601,3,1, 0.779656,0.843654,15,3, 0.996796,0.999695,15,2, 0.611499,0.392438,6,0, 0.266213,0.297281,5,2, 0.840144,0.0237434,3,1, 0.375866,0.0926237,1,0, 0.677206,0.0562151,2,3, 0.00878933,0.91879,12,2, 0.275887,0.272897,5,2, 0.587909,0.691183,10,4, 0.837611,0.726493,11,1, 0.484939,0.205359,1,2, 0.743736,0.468459,6,2, 0.457961,0.949156,13,3, 0.744438,0.10828,2,2, 0.599048,0.385235,6,0, 0.735008,0.608966,10,2, 0.572405,0.361339,6,0, 0.151555,0.225105,0,3, 0.425153,0.802881,13,3,
计算均值、方差等统计特征:
#*******************************************************************************
# Copyright 2014-2018 Intel Corporation
# All Rights Reserved.
#
# This software is licensed under the Apache License, Version 2.0 (the
# "License"), the following terms apply:
#
# You may not use this file except in compliance with the License.  You may
# obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#
# See the License for the specific language governing permissions and
# limitations under the License.
#*******************************************************************************

# daal4py low order moments example for shared memory systems

import daal4py as d4p
import numpy as np

# let's try to use pandas' fast csv reader
try:
    import pandas
except ImportError:
    # Only a missing pandas triggers the fallback; any other error propagates.
    def read_csv(f, c):
        """Load columns *c* of the headerless CSV file *f* as a 2-D array (numpy loadtxt)."""
        return np.loadtxt(f, usecols=c, delimiter=',', ndmin=2)
else:
    def read_csv(f, c):
        """Load columns *c* of the headerless CSV file *f* as a 2-D float64 array (pandas reader)."""
        return pandas.read_csv(f, usecols=c, delimiter=',', header=None, dtype=np.float64).values


# Result attributes that are each checked to have shape (1, number of columns)
_MOMENT_FIELDS = (
    "minimum",
    "maximum",
    "sum",
    "sumSquares",
    "sumSquaresCentered",
    "mean",
    "secondOrderRawMoment",
    "variance",
    "standardDeviation",
    "variation",
)


def main():
    """Compute low order moments over the first 10 columns of the sample CSV.

    Returns the result object produced by ``d4p.low_order_moments().compute``.
    """
    # read data from file
    infile = "./data/batch/covcormoments_dense.csv"
    data = read_csv(infile, range(10))

    # compute
    alg = d4p.low_order_moments()
    res = alg.compute(data)

    # result provides minimum, maximum, sum, sumSquares, sumSquaresCentered,
    # mean, secondOrderRawMoment, variance, standardDeviation, variation
    expected_shape = (1, data.shape[1])
    for field in _MOMENT_FIELDS:
        assert getattr(res, field).shape == expected_shape, field

    return res


if __name__ == "__main__":
    res = main()
    # print results
    print(" Minimum: ", res.minimum)
    print(" Maximum: ", res.maximum)
    print(" Sum: ", res.sum)
    print(" Sum of squares: ", res.sumSquares)
    print(" Sum of squared difference from the means: ", res.sumSquaresCentered)
    print(" Mean: ", res.mean)
    print(" Second order raw moment: ", res.secondOrderRawMoment)
    print(" Variance: ", res.variance)
    print(" Standard deviation: ", res.standardDeviation)
    print(" Variation: ", res.variation)
    print('All looks good!')