Posted by jydg on 2019-01-30 14:10:50

Spark RDDs vs DataFrames vs SparkSQL

#!/usr/bin/env python
from time import time
from pyspark.sql import SQLContext, Row
from pyspark import SparkConf, SparkContext

## configure the job: 10 executors with 2 cores and 500 MB of memory each,
## with dynamic allocation and the external shuffle service turned off
conf = (SparkConf()
        .setAppName("spark_sql_random_lookup")
        .set("spark.executor.instances", "10")
        .set("spark.executor.cores", "2")
        .set("spark.dynamicAllocation.enabled", "false")
        .set("spark.shuffle.service.enabled", "false")
        .set("spark.executor.memory", "500MB"))
sc = SparkContext(conf=conf)

sqlContext = SQLContext(sc)

t0 = time()

path = "/data/customer_orders*"
lines = sc.textFile(path)

## create data frame: split each pipe-delimited line
## and map the fields onto named Row columns
orders_df = sqlContext.createDataFrame( \
    lines.map(lambda l: l.split("|")) \
    .map(lambda p: Row(cust_id=int(p[0]), order_id=int(p[1]), email_hash=p[2], \
        ssn_hash=p[3], product_id=int(p[4]), product_desc=p[5], \
        country=p[6], state=p[7], shipping_carrier=p[8], \
        shipping_type=p[9], shipping_class=p[10])))

## register data frame as a temporary table
orders_df.registerTempTable("orders")

## random lookup: filter where the order_id is equal to 96922894
print(sqlContext.sql("SELECT * FROM orders where order_id = 96922894").collect())

tt = str(time() - t0)
print("SparkSQL performed in " + tt + " seconds")
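
The script above only shows the SparkSQL variant. For comparison with the other two approaches in the title, here is a minimal sketch of the same random lookup done with the DataFrame API and with the raw RDD. It assumes the orders_df and lines variables defined in the script above and the same field order (order_id as the second pipe-delimited field); only the filter calls are new.

## same lookup via the DataFrame API, reusing orders_df from above
print(orders_df.filter(orders_df["order_id"] == 96922894).collect())

## same lookup via the raw RDD, reusing lines from above:
## split each line and keep rows whose second field is the order id
print(lines.map(lambda l: l.split("|")) \
    .filter(lambda p: int(p[1]) == 96922894) \
    .collect())

The DataFrame filter benefits from the same optimizer as the SQL query, while the RDD version has to split and compare every line, which is the performance difference the title's comparison is about.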

