This code counts how many times each word appears in a text file, using Apache Spark's Java API: lines are split into words, each word is mapped to a (word, 1) pair, and the pairs are summed per word with reduceByKey.
package geo1.op1;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.util.Arrays;

public class WordCount {

    // Splits each input line into words on single spaces.
    private static final FlatMapFunction<String, String> WORDS_EXTRACTOR =
            new FlatMapFunction<String, String>() {
                @Override
                public Iterable<String> call(String s) throws Exception {
                    return Arrays.asList(s.split(" "));
                }
            };

    // Maps each word to a (word, 1) pair.
    private static final PairFunction<String, String, Integer> WORDS_MAPPER =
            new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(String s) throws Exception {
                    return new Tuple2<String, Integer>(s, 1);
                }
            };

    // Sums the counts for each word.
    private static final Function2<Integer, Integer, Integer> WORDS_REDUCER =
            new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer a, Integer b) throws Exception {
                    return a + b;
                }
            };

    public static void main(String[] args) {
        // if (args.length < 1) {
        //     System.err.println("Please provide the input file full path as argument");
        //     System.exit(0);
        // }
        String inputPath = "/home/steve/Documents/words.txt";
        String outputFolder = "/home/steve/Documents/ans";

        // Run in local mode; change the master URL to submit to a cluster.
        SparkConf conf = new SparkConf()
                .setAppName("org.sparkexample.WordCount")
                .setMaster("local");
        JavaSparkContext context = new JavaSparkContext(conf);

        JavaRDD<String> file = context.textFile(inputPath);
        JavaRDD<String> words = file.flatMap(WORDS_EXTRACTOR);
        JavaPairRDD<String, Integer> pairs = words.mapToPair(WORDS_MAPPER);
        JavaPairRDD<String, Integer> counter = pairs.reduceByKey(WORDS_REDUCER);

        counter.saveAsTextFile(outputFolder);
    }
}
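For comparison, the same pipeline can be written more compactly with Java 8 lambdas, because each Spark Java function interface (FlatMapFunction, PairFunction, Function2) declares a single abstract method. This is only a minimal sketch under a few assumptions: a Java 8 compiler, the same Spark 1.x API used above (where flatMap still takes an Iterable-returning function), and the same hard-coded input/output paths; the class name WordCountLambda is just for illustration.

package geo1.op1;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.Arrays;

public class WordCountLambda {

    public static void main(String[] args) {
        // Local mode with a single worker thread; adjust the master URL for a real cluster.
        SparkConf conf = new SparkConf()
                .setAppName("org.sparkexample.WordCount")
                .setMaster("local");
        JavaSparkContext context = new JavaSparkContext(conf);

        JavaRDD<String> file = context.textFile("/home/steve/Documents/words.txt");

        // Split each line on spaces, emit (word, 1) pairs, then sum the counts per word.
        JavaRDD<String> words = file.flatMap(line -> Arrays.asList(line.split(" ")));
        JavaPairRDD<String, Integer> pairs = words.mapToPair(word -> new Tuple2<>(word, 1));
        JavaPairRDD<String, Integer> counts = pairs.reduceByKey((a, b) -> a + b);

        counts.saveAsTextFile("/home/steve/Documents/ans");

        context.stop();
    }
}

Either version writes the results as part-XXXXX files inside the output folder, one "(word,count)" tuple per line.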
pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>geo1</groupId>
  <artifactId>op1</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>
  <name>op1</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.10</artifactId>
      <version>1.2.0</version>
    </dependency>
  </dependencies>
</project>
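The spark-core_2.10 1.2.0 dependency is all the code above needs to compile. To control which Java language level Maven compiles against, you would typically also add a maven-compiler-plugin section; the snippet below is only a sketch (the plugin version 3.1 and the 1.7 level are assumptions, and you would need 1.8 for the lambda variant shown earlier). If you later submit the jar with spark-submit, marking the Spark dependency as provided is a common additional step.

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.1</version>
        <configuration>
          <!-- assumed levels: 1.7 suffices for the anonymous-class version, 1.8 for lambdas -->
          <source>1.7</source>
          <target>1.7</target>
        </configuration>
      </plugin>
    </plugins>
  </build>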