scala> textFile.count() // Number of items in this Dataset
res0: Long = 126 // May be different from yours as README.md will change over time, similar to other outputs

scala> textFile.first() // First item in this Dataset
res1: String = # Apache Spark
textFile = spark.read.text("README.md")
textFile.count()  # Number of rows in this DataFrame
126

textFile.first()  # First row in this DataFrame
Row(value=u'# Apache Spark')
使用 filter 进行过滤
linesWithSpark = textFile.filter(textFile.value.contains("Spark"))
链式调用方式
textFile.filter(textFile.value.contains("Spark")).count()  # How many lines contain "Spark"?
15
```
进阶
object SimpleApp { def main(args: Array[String]) { val logFile = "YOURSPARKHOME/README.md" // Should be some file on your system val spark = SparkSession.builder.appName("Simple Application").getOrCreate() val logData = spark.read.textFile(logFile).cache() val numAs = logData.filter(line => line.contains("a")).count() val numBs = logData.filter(line => line.contains("b")).count() println(s"Lines with a: $numAs, Lines with b: $numBs") spark.stop() } } ``` 运行应用
Use spark-submit to run your application
$ YOUR_SPARK_HOME/bin/spark-submit \
  --class "SimpleApp" \
  --master local[4] \
  target/scala-2.11/simple-project_2.11-1.0.jar
...
Lines with a: 46, Lines with b: 23

Java：分别统计包含单词 a 和单词 b 的行数

/* SimpleApp.java */
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.Dataset;
public class SimpleApp {
  public static void main(String[] args) {
    String logFile = "YOUR_SPARK_HOME/README.md"; // Should be some file on your system
    SparkSession spark = SparkSession.builder().appName("Simple Application").getOrCreate();
    Dataset<String> logData = spark.read().textFile(logFile).cache();

    long numAs = logData.filter(s -> s.contains("a")).count();
long numBs = logData.filter(s -> s.contains("b")).count();
System.out.println("Lines with a: " + numAs + ", lines with b: " + numBs);
spark.stop();
} } ``` 运行应用
Use spark-submit to run your application
$ YOUR_SPARK_HOME/bin/spark-submit \
  --class "SimpleApp" \
  --master local[4] \
  target/simple-project-1.0.jar
...
Lines with a: 46, Lines with b: 23
```
```
"""SimpleApp.py"""
from pyspark.sql import SparkSession
logFile = "YOUR_SPARK_HOME/README.md"  # Should be some file on your system
spark = SparkSession.builder.appName("Simple Application").getOrCreate()
logData = spark.read.text(logFile).cache()

numAs = logData.filter(logData.value.contains('a')).count()
numBs = logData.filter(logData.value.contains('b')).count()
print("Lines with a: %i, lines with b: %i" % (numAs, numBs))
spark.stop()
```
运行应用
Use spark-submit to run your application
$ YOUR_SPARK_HOME/bin/spark-submit \
  --master local[4] \
  SimpleApp.py
...
Lines with a: 46, Lines with b: 23
```
文章来自:https://www.itjmd.com/news/show-4240.html