• Spark data analysis


    // Practice the conversion flow between JavaRDD and DataFrame

    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.Function;
    import org.apache.spark.api.java.function.VoidFunction;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.RowFactory;
    import org.apache.spark.sql.SQLContext;
    import org.apache.spark.sql.types.DataTypes;
    import org.apache.spark.sql.types.StructField;
    import org.apache.spark.sql.types.StructType;
    
    import java.util.Arrays;
    import java.util.Iterator;
    import java.util.List;
    
    /**
     *
     * @author 雪瞳
     * @Slogan The clock keeps moving forward; how can a person stand still!
     * @Function Convert a JavaRDD to a DataFrame and query it with SQL
     *
     */
    public class DataFrameTest {
        public static void main(String[] args) {
            String master = "local";
            String appName = "data";
            SparkConf conf = new SparkConf().setAppName(appName).setMaster(master);
            JavaSparkContext sc = new JavaSparkContext(conf);
            sc.setLogLevel("ERROR");
            SQLContext sqlContext = new SQLContext(sc);
    
            String path = "./data/df.txt";
            // Read the text file and return a JavaRDD
            JavaRDD<String> textRDD = sc.textFile(path);
            // Turn each line into an iterator over its words; map is a one-to-one transformation
            JavaRDD<Iterator<String>> iteratorJavaRDD = textRDD.map(new Function<String, Iterator<String>>() {
                @Override
                public Iterator<String> call(String line) throws Exception {
                    String[] words = line.split(" ");
                    List<String> list = Arrays.asList(words);
                    return list.iterator();
                }
            });
            // Iterate over the results
            iteratorJavaRDD.foreach(new VoidFunction<Iterator<String>>() {
                @Override
                public void call(Iterator<String> stringIterator) throws Exception {
                    while (stringIterator.hasNext()){
                        System.out.println(stringIterator.next());
                    }
                }
            });
            System.out.println("-------------------------------------------------");
            // Convert the JavaRDD to a JavaRDD<Row>, then map it to a DataFrame via a schema
            JavaRDD<Row> rowRdd = textRDD.map(new Function<String, Row>() {
                @Override
                public Row call(String line) throws Exception {
                    String[] words = line.split(" ");
                    return RowFactory.create(
                            words[0],
                            Integer.valueOf(words[1])
                    );
                }
            });
            // Define the struct fields: column name, data type, nullable
            List<StructField> asList = Arrays.asList(
                    DataTypes.createStructField("name", DataTypes.StringType, true),
                    DataTypes.createStructField("score", DataTypes.IntegerType, true)
            );
            // Build the schema and map the rows into a DataFrame
            StructType schema = DataTypes.createStructType(asList);
            Dataset<Row> df = sqlContext.createDataFrame(rowRdd, schema);
            df.show();
            // Register a temporary view and query it with SQL
            System.out.println("--------------------------------------------");
            df.createOrReplaceTempView("student");
            String sqlText = "select name, score from student where score > 70";
            sqlContext.sql(sqlText).show();
            sc.stop();
        }
    }
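
For reference, the program expects ./data/df.txt to hold one record per line: a name and an integer score separated by a single space (that is what the split(" ") and the name/score schema assume). The original post does not show the file, so the values below are hypothetical:

    zhangsan 85
    lisi 62
    wangwu 90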
    
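A side note on the map-to-Iterator step above: it works, but the idiomatic way to flatten lines into words is flatMap, which yields a JavaRDD<String> directly instead of a JavaRDD<Iterator<String>>. A minimal sketch, assuming the same textRDD as above and Spark 2.x, where FlatMapFunction.call returns an Iterator (needs the extra import org.apache.spark.api.java.function.FlatMapFunction):

    JavaRDD<String> wordRDD = textRDD.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String line) throws Exception {
            // Split each line on spaces and flatten all words into one RDD
            return Arrays.asList(line.split(" ")).iterator();
        }
    });
    // Each element is now a single word, so no nested iteration is needed
    wordRDD.foreach(word -> System.out.println(word));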

      

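Also worth noting: SQLContext has been deprecated since Spark 2.0 in favor of SparkSession, the unified entry point for the RDD and SQL APIs. A minimal sketch of the same RDD-to-DataFrame flow with SparkSession, assuming the same rowRdd and schema built above (needs import org.apache.spark.sql.SparkSession):

    SparkSession spark = SparkSession.builder()
            .appName("data")
            .master("local")
            .getOrCreate();
    // createDataFrame applies the schema to the JavaRDD<Row>, just as SQLContext did
    Dataset<Row> df2 = spark.createDataFrame(rowRdd, schema);
    df2.createOrReplaceTempView("student");
    spark.sql("select name, score from student where score > 70").show();
    spark.stop();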
  • Original post: https://www.cnblogs.com/walxt/p/12751410.html