• Spark MLlib BasicStatistics 统计学基础


    一, jar依赖,jsc创建。

    package ML.BasicStatistics;
    
    import com.google.common.collect.Lists;
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaDoubleRDD;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.DoubleFlatMapFunction;
    import org.apache.spark.api.java.function.Function;
    import org.apache.spark.api.java.function.PairFunction;
    import org.apache.spark.api.java.function.VoidFunction;
    import org.apache.spark.mllib.linalg.Matrices;
    import org.apache.spark.mllib.linalg.Matrix;
    import org.apache.spark.mllib.linalg.Vector;
    import org.apache.spark.mllib.linalg.Vectors;
    import org.apache.spark.mllib.regression.LabeledPoint;
    import org.apache.spark.mllib.stat.KernelDensity;
    import org.apache.spark.mllib.stat.MultivariateStatisticalSummary;
    import org.apache.spark.mllib.stat.Statistics;
    import org.apache.spark.mllib.stat.test.ChiSqTestResult;
    import org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult;
    import org.apache.spark.mllib.util.MLUtils;
    import org.apache.spark.rdd.RDD;
    import scala.Tuple2;
    import scala.runtime.Statics;
    import static org.apache.spark.mllib.random.RandomRDDs.*;
    
    import java.util.*;
    
    /**
     * TODO
     *
     * @ClassName: BasicStatistics
     * @author: DingH
     * @since: 2019/4/3 16:11
     */
    public class BasicStatistics {
        public static void main(String[] args) {
            System.setProperty("hadoop.home.dir","E:\hadoop-2.6.5");
            SparkConf conf = new SparkConf().setAppName("BasicStatistics").setMaster("local");
            JavaSparkContext jsc = new JavaSparkContext(conf);

    二。Summary statistics

            // Summary statistics: Statistics.colStats returns a MultivariateStatisticalSummary
            // holding the column-wise max, min, mean, variance and number of non-zeros,
            // as well as the total count.
            List<Vector> rows = Arrays.asList(
                    Vectors.dense(1, 0, 3),
                    Vectors.dense(2, 0, 4),
                    Vectors.dense(3, 0, 5)
            );
            JavaRDD<Vector> observations = jsc.parallelize(rows);
            MultivariateStatisticalSummary summary = Statistics.colStats(observations.rdd());

            // Print the per-column mean, then variance, then non-zero count.
            System.out.println(summary.mean());
            System.out.println(summary.variance());
            System.out.println(summary.numNonzeros());

    三。Correlations:相关性

            /**
             * Correlations: computes the correlation between two series of doubles, and the
             * pairwise correlation matrix of the columns of an RDD of row vectors.
             */
            JavaRDD<Tuple2<String, String>> pairs = jsc.parallelize(Lists.newArrayList(
                    new Tuple2<String, String>("cat", "11"),
                    new Tuple2<String, String>("dog", "22"),
                    new Tuple2<String, String>("cat", "33"),
                    new Tuple2<String, String>("pig", "44")
            ));

            // seriesX: the second field of every tuple parsed as a double.
            JavaDoubleRDD seriesX = pairs.mapPartitionsToDouble(new DoubleFlatMapFunction<Iterator<Tuple2<String, String>>>() {
                @Override
                public Iterable<Double> call(Iterator<Tuple2<String, String>> tuples) throws Exception {
                    List<Double> values = new ArrayList<Double>();
                    while (tuples.hasNext()) {
                        values.add(Double.parseDouble(tuples.next()._2));
                    }
                    return values;
                }
            });
            // seriesY: the same values shifted by +1. Both series are derived from the same
            // RDD, so they share its partitioning and element count.
            JavaDoubleRDD seriesY = pairs.mapPartitionsToDouble(new DoubleFlatMapFunction<Iterator<Tuple2<String, String>>>() {
                @Override
                public Iterable<Double> call(Iterator<Tuple2<String, String>> tuples) throws Exception {
                    List<Double> values = new ArrayList<Double>();
                    while (tuples.hasNext()) {
                        values.add(Double.parseDouble(tuples.next()._2) + 1);
                    }
                    return values;
                }
            });
            // Compute the correlation using Pearson's method. Enter "spearman" for Spearman's
            // method. If a method is not specified, Pearson's method will be used by default.
            double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
            // Print the scalar result too; previously it was computed and then discarded.
            System.out.println(correlation);

            // Note that each Vector is a row and not a column.
            JavaRDD<Vector> rowVectors = jsc.parallelize(Arrays.asList(
                    Vectors.dense(1, 0, 3),
                    Vectors.dense(2, 0, 4),
                    Vectors.dense(3, 0, 5)
            ));
            Matrix correlation2 = Statistics.corr(rowVectors.rdd(), "spearman");
            System.out.println(correlation2);

    三,Stratified sampling:分层抽样

            // Stratified sampling: draw a sample from a key/value RDD using a separate
            // sampling fraction for every key.
            List<Tuple2<String, String>> records = Lists.newArrayList(
                    new Tuple2<String, String>("cat", "11"),
                    new Tuple2<String, String>("dog", "22"),
                    new Tuple2<String, String>("cat", "33"),
                    new Tuple2<String, String>("pig", "44")
            );
            JavaRDD<Tuple2<String, String>> animals = jsc.parallelize(records);

            // Turn the RDD of tuples into a pair RDD (any key/value pair RDD would do).
            PairFunction<Tuple2<String, String>, String, String> toPair =
                    new PairFunction<Tuple2<String, String>, String, String>() {
                        public Tuple2<String, String> call(Tuple2<String, String> record) throws Exception {
                            return new Tuple2<String, String>(record._1, record._2);
                        }
                    };
            JavaPairRDD data = animals.mapToPair(toPair);

            // Desired sampling fraction for each key.
            Map<String, Double> fractions = new HashMap<String, Double>();
            fractions.put("cat", 0.5);
            fractions.put("dog", 0.8);
            fractions.put("pig", 0.8);

            // Sample without replacement, once with sampleByKey and once with sampleByKeyExact.
            // Only the sampleByKey result is printed below.
            JavaPairRDD approxSample = data.sampleByKey(false, fractions);
            JavaPairRDD exactSample = data.sampleByKeyExact(false, fractions);

            VoidFunction printElement = new VoidFunction() {
                public void call(Object element) throws Exception {
                    System.out.println(element);
                }
            };
            approxSample.foreach(printElement);

    四。Hypothesis testing  假设检验

            // Hypothesis testing: chi-squared tests (goodness of fit, independence, per-feature)
            // followed by a Kolmogorov-Smirnov test.

            // A vector composed of the frequencies of events.
            Vector frequencies = Vectors.dense(1, 2, 3, 4);

            // Goodness of fit. No second vector is supplied, so the test runs against a
            // uniform distribution. Printing the result gives a summary including the p-value,
            // degrees of freedom, test statistic, the method used, and the null hypothesis.
            ChiSqTestResult goodnessOfFitTestResult = Statistics.chiSqTest(frequencies);
            System.out.println(goodnessOfFitTestResult);

            // A 3x2 contingency matrix built from six values.
            double[] cells = new double[]{1, 2, 3, 4, 5, 6};
            Matrix contingency = Matrices.dense(3, 2, cells);

            // Pearson's independence test on the contingency matrix; the printed summary
            // includes the p-value, degrees of freedom, ...
            ChiSqTestResult independenceTestResult = Statistics.chiSqTest(contingency);
            System.out.println(independenceTestResult);

            // An RDD of labeled points loaded from a LibSVM file.
            JavaRDD<LabeledPoint> labeledPoints = MLUtils.loadLibSVMFile(jsc.sc(), "/data...").toJavaRDD();

            // The contingency table is constructed from the raw (feature, label) pairs and used
            // to conduct the independence test. The returned array holds one ChiSqTestResult
            // per feature, tested against the label.
            ChiSqTestResult[] featureTestResults = Statistics.chiSqTest(labeledPoints.rdd());
            for (int column = 0; column < featureTestResults.length; column++) {
                // Columns are reported 1-based.
                System.out.println("Column " + (column + 1) + ":");
                System.out.println(featureTestResults[column]); // summary of the test
            }

            // Kolmogorov-Smirnov test of a small sample against the "norm" distribution.
            JavaDoubleRDD sample = jsc.parallelizeDoubles(Arrays.asList(0.2, 1.0, 0.3));
            KolmogorovSmirnovTestResult ksResult = Statistics.kolmogorovSmirnovTest(sample, "norm");
            // Summary of the test including the p-value, test statistic and null hypothesis.
            // If the p-value indicates significance, the null hypothesis can be rejected.
            System.out.println(ksResult);

    五。Random data generation  

             // Random data generation: uniform, standard normal, or Poisson.

            // Random doubles produced by normalJavaRDD with arguments (jsc, 100, 2).
            JavaDoubleRDD standardNormal = normalJavaRDD(jsc, 100, 2);

            // Transform each value x into 1 + 2x to get a random double RDD following `N(1, 4)`.
            Function<Double, Double> shiftAndScale = new Function<Double, Double>() {
                public Double call(Double value) throws Exception {
                    return 1.0 + 2.0 * value;
                }
            };
            // Prints each transformed value on its own line.
            VoidFunction<Double> printValue = new VoidFunction<Double>() {
                public void call(Double value) throws Exception {
                    System.out.println(value);
                }
            };
            standardNormal.map(shiftAndScale).foreach(printValue);

    六。Kernel density estimation

            // Kernel density estimation.

            // An RDD of sample data.
            JavaRDD<Double> samples = jsc.parallelize(Arrays.asList(1.0, 2.0, 3.0));

            // Construct the density estimator with the sample data and a standard deviation
            // (bandwidth) of 3.0 for the Gaussian kernels.
            KernelDensity estimator = new KernelDensity();
            estimator.setSample(samples);
            estimator.setBandwidth(3.0);

            // Find density estimates for the given values and print one per line.
            double[] evaluationPoints = new double[] {-1.0, 2.0, 5.0};
            for (double density : estimator.estimate(evaluationPoints)) {
                System.out.println(density);
            }
  • 相关阅读:
    操作系统简介
    计算机基础
    Django之form
    CMDB资产采集
    Git
    User model
    多级评论
    个人主页
    media路径设置
    Web框架
  • 原文地址:https://www.cnblogs.com/dhName/p/10655450.html
Copyright © 2020-2023  润新知