Posted by jydg on 2019-01-30 14:10:50

Spark RDDs vs DataFrames vs SparkSQL

#!/usr/bin/env python
from time import time
from pyspark.sql import SQLContext, Row
from pyspark import SparkConf, SparkContext

## configure the job: 10 executors with 2 cores and 500 MB of memory each,
## with dynamic allocation and the external shuffle service turned off
conf = (SparkConf()
        .setAppName("spark_sql_random_lookup")
        .set("spark.executor.instances", "10")
        .set("spark.executor.cores", "2")
        .set("spark.dynamicAllocation.enabled", "false")
        .set("spark.shuffle.service.enabled", "false")
        .set("spark.executor.memory", "500MB"))
sc = SparkContext(conf=conf)

sqlContext = SQLContext(sc)

t0 = time()

path = "/data/customer_orders*"
lines = sc.textFile(path)

## create data frame: split each pipe-delimited line
## and map the fields onto named Row columns
orders_df = sqlContext.createDataFrame( \
    lines.map(lambda l: l.split("|")) \
    .map(lambda p: Row(cust_id=int(p[0]), order_id=int(p[1]), email_hash=p[2], \
        ssn_hash=p[3], product_id=int(p[4]), product_desc=p[5], \
        country=p[6], state=p[7], shipping_carrier=p[8], \
        shipping_type=p[9], shipping_class=p[10])))

## register data frame as a temporary table
orders_df.registerTempTable("orders")

## random lookup: filter where the order_id is equal to 96922894
print(sqlContext.sql("SELECT * FROM orders where order_id = 96922894").collect())

tt = str(time() - t0)
print("SparkSQL performed in " + tt + " seconds")
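
The script above only shows the SparkSQL variant. For comparison with the other two approaches in the title, here is a minimal sketch of the same random lookup done with the DataFrame API and with the raw RDD. It assumes the orders_df and lines variables defined in the script above and the same field order (order_id as the second pipe-delimited field); only the filter calls are new.

## same lookup via the DataFrame API, reusing orders_df from above
print(orders_df.filter(orders_df["order_id"] == 96922894).collect())

## same lookup via the raw RDD, reusing lines from above:
## split each line and keep rows whose second field is the order id
print(lines.map(lambda l: l.split("|")) \
    .filter(lambda p: int(p[1]) == 96922894) \
    .collect())

The DataFrame filter benefits from the same optimizer as the SQL query, while the RDD version has to split and compare every line, which is the performance difference the title's comparison is about.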

