pom.xml:
<?xml version="1.0" encoding="UTF-8"?> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <groupId>com.ultiwill</groupId> <artifactId>StructuredStreaming</artifactId> <version>1.0-SNAPSHOT</version> <dependencies> <dependency> <groupId>org.scala-lang</groupId> <artifactId>scala-library</artifactId> <version>2.11.8</version> </dependency> <dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-sql_2.11</artifactId> <version>2.2.0</version> </dependency> </dependencies> <build> <!--指定打包的位置,默认只打src/main/java目录,且只能打包一个目录--> <sourceDirectory>src/main/scala</sourceDirectory> <testSourceDirectory>src/test/scala</testSourceDirectory> <plugins> <!--指定java版本--> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-compiler-plugin</artifactId> <version>3.0</version> <configuration> <source>1.8</source> <target>1.8</target> <encoding>UTF-8</encoding> </configuration> </plugin> <!-- 没有该插件的话,在scala中无法找到java的类 --> <plugin> <groupId>org.codehaus.mojo</groupId> <artifactId>build-helper-maven-plugin</artifactId> <version>3.0.0</version> <executions> <execution> <id>add-source</id> <phase>generate-sources</phase> <goals> <goal>add-source</goal> </goals> <configuration> <sources> <source>src/main/java</source> </sources> </configuration> </execution> </executions> </plugin> <!--scala依赖插件,为scala提供支持--> <plugin> <groupId>net.alchim31.maven</groupId> <artifactId>scala-maven-plugin</artifactId> <version>3.2.0</version> <executions> <execution> <goals> <goal>compile</goal> <goal>testCompile</goal> </goals> <configuration> <args> <arg>-dependencyfile</arg> <arg>${project.build.directory}/.scala_dependencies</arg> </args> </configuration> </execution> </executions> </plugin> <!--把所有jar包集成到一个jar包中--> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-shade-plugin</artifactId> <version>3.1.1</version> <executions> <execution> <phase>package</phase> <goals> <goal>shade</goal> </goals> <configuration> <!--去掉META-INF文件中可能出现的非法签名文件--> <filters> <filter> <artifact>*:*</artifact> <excludes> <exclude>META-INF/*.SF</exclude> <exclude>META-INF/*.DSA</exclude> <exclude>META-INF/*.RSA</exclude> </excludes> </filter> </filters> <transformers> <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"> <mainClass>com.ultiwill.structured.SocketWordCount</mainClass> </transformer> </transformers> </configuration> </execution> </executions> </plugin> </plugins> </build> </project>
Complete mode (SocketWordCount):
package com.ultiwill.structured

import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

/**
 * @author chong.zuo
 * @date 2020/8/7 15:03
 */
object SocketWordCount {

  def main(args: Array[String]): Unit = {
    // Create the SparkSession
    val spark = SparkSession.builder()
      .master("local[6]")
      .appName("socket_structured")
      .getOrCreate()

    // Raise the log level so the console output is not drowned in INFO logs
    spark.sparkContext.setLogLevel("ERROR")

    // Implicit conversions, e.g. DataFrame -> Dataset
    import spark.implicits._

    // Source: read a text stream from a socket
    val source: DataFrame = spark.readStream
      .format("socket")
      .option("host", "192.168.100.110")
      .option("port", "9999")
      .load()
    val sourceDS: Dataset[String] = source.as[String]

    // Transformation: split each line into words and count occurrences
    val words = sourceDS.flatMap(_.split(" "))
      //.withWatermark()
      .groupBy("value")
      .count()

    // Sink: write the full result table to the console on every trigger
    words.writeStream
      .outputMode(OutputMode.Complete())
      .format("console")
      .start()
      .awaitTermination()
  }
}
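A minimal variation (a sketch, not part of the original code): because Complete mode re-emits the whole result table on every trigger, the same aggregation can also be written to the built-in "memory" sink and inspected with plain SQL. The query name wordCounts is an arbitrary illustrative choice; this block would replace the console sink inside main:

    // Sketch: write the complete result table to the in-memory sink instead of the console.
    // "wordCounts" becomes the name of a queryable in-memory table.
    val query = words.writeStream
      .outputMode(OutputMode.Complete())
      .format("memory")
      .queryName("wordCounts")
      .start()

    // The in-memory table always holds the latest complete result of the aggregation
    spark.sql("SELECT * FROM wordCounts").show()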
Append mode (SocketWordCount2):
In Append mode, streaming aggregations must be defined over an event-time window (together with a watermark), because a result row can only be emitted once its window is finalized.
package com.ultiwill.structured

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.OutputMode

/**
 * @author chong.zuo
 * @date 2020/8/7 17:24
 */
object SocketWordCount2 {

  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder
      .master("local[2]")
      .appName("example")
      .getOrCreate()

    // Create a stream from a socket source
    import sparkSession.implicits._
    sparkSession.sparkContext.setLogLevel("ERROR")
    val socketStreamDs = sparkSession.readStream
      .format("socket")
      .option("host", "192.168.100.110")
      .option("port", 9999)
      .load()
      .as[String]

    // Parse each line "timestamp,symbol,value" into a row with an event-time column
    val stockDs = socketStreamDs
      .map(value => value.trim.split(","))
      .map(entries => (new java.sql.Timestamp(entries(0).toLong), entries(1), entries(2).toDouble))
      .toDF("time", "symbol", "value")

    // Aggregate per symbol over 10-second event-time windows, accepting data up to 20 seconds late
    val windowedCount = stockDs
      .withWatermark("time", "20000 milliseconds")
      .groupBy(
        window($"time", "10 seconds"),
        $"symbol"
      )
      .agg(sum("value"), count($"symbol"))

    windowedCount.writeStream
      .outputMode(OutputMode.Append())
      .format("console")
      // "truncate" = false keeps full column values in the console output instead of abbreviating them
      .option("truncate", "false")
      .start()
      .awaitTermination()
  }
}
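For testing (derived from the parsing code above, not from the original post): each socket line must have the form "epoch-milliseconds,symbol,value". With nc -lk 9999 running on 192.168.100.110, lines such as the following (made-up values) would be aggregated into 10-second windows per symbol:

    1596790980000,aapl,100.0
    1596790984000,aapl,200.0
    1596790991000,goog,300.0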