import java.util.ArrayList;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

SparkConf sparkConf = new SparkConf()
        .setAppName("Internal_Func")
        .setMaster("local");
JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
SQLContext sqlContext = new SQLContext(javaSparkContext);

// Sample data as "name,sc" pairs; note the duplicate "2,111" row,
// which the DISTINCT below will collapse.
List<String> list = new ArrayList<String>();
list.add("1,1");
list.add("2,11");
list.add("2,111");
list.add("2,111");
list.add("3,1111");
list.add("3,11111");
JavaRDD<String> rdd_str = javaSparkContext.parallelize(list, 5);

// Parse each comma-separated string into a Row of (String name, long sc).
JavaRDD<Row> rdd_row = rdd_str.map(new Function<String, Row>() {
    @Override
    public Row call(String v1) throws Exception {
        String[] ary = v1.split(",");
        return RowFactory.create(ary[0], Long.parseLong(ary[1]));
    }
});

// Define the schema and build a DataFrame from the Row RDD.
List<StructField> fieldList = new ArrayList<StructField>();
fieldList.add(DataTypes.createStructField("name", DataTypes.StringType, true));
fieldList.add(DataTypes.createStructField("sc", DataTypes.LongType, true));
StructType tmp = DataTypes.createStructType(fieldList);
DataFrame df = sqlContext.createDataFrame(rdd_row, tmp);

// Register a temp table, then count the distinct sc values per name.
df.registerTempTable("tmp_sc");
DataFrame df_agg = sqlContext.sql(
        "select name, count(distinct sc) from tmp_sc group by name");
df_agg.show();
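The same aggregation can also be expressed with the DataFrame API instead of a SQL string. Below is a minimal sketch, assuming the same df built above and the standard org.apache.spark.sql.functions helpers from Spark 1.x; the alias distinct_sc is purely illustrative:

import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.functions;

// Group by name and count distinct sc values via column functions
// rather than a SQL string; "distinct_sc" is an illustrative alias.
DataFrame df_agg2 = df.groupBy("name")
        .agg(functions.countDistinct("sc").alias("distinct_sc"));
df_agg2.show();

Either way, the duplicate "2,111" row is collapsed by the DISTINCT, so the result is one distinct value for name 1 and two each for names 2 and 3.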