Structured Data Processing with Spark¶

Feng Li¶

Central University of Finance and Economics¶

feng.li@cufe.edu.cn¶

Course home page: https://feng.li/distcomp¶

Spark SQL¶

  • Unlike the basic Spark RDD API, the interfaces provided by Spark SQL give Spark more information about the structure of both the data and the computation being performed.

  • Spark SQL uses this extra information to perform additional optimizations.

  • One use of Spark SQL is to execute SQL queries (see the sketch after this list).

  • Spark SQL can also be used to read data from an existing Hive installation.
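
A minimal sketch of running a SQL query through Spark SQL (the session, data, and view name here are only for illustration; the full session setup used in this course follows later in these notes):

In [ ]:
from pyspark.sql import SparkSession

# Build a throwaway local session, register a tiny DataFrame as a
# temporary view, and query it with plain SQL.
spark = SparkSession.builder.master("local[*]").appName("sql-sketch").getOrCreate()
df = spark.createDataFrame([("Justin", 19), ("Andy", 30)], ["name", "age"])
df.createOrReplaceTempView("people")
spark.sql("SELECT name FROM people WHERE age < 20").show()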

Datasets and DataFrames¶

Spark Datasets¶

  • A Dataset is a distributed collection of data.

  • Dataset can be constructed from JVM objects and then manipulated using functional transformations (map, flatMap, filter, etc.).

  • The Dataset API is available in Scala and Java.

  • Python does not support the Dataset API. However, due to Python's dynamic nature, many of the benefits of the Dataset API are already available (e.g. you can naturally access the fields of a row by name: row.columnName), as in the short example below.
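
A quick illustration of by-name field access on a PySpark Row (the record here is purely hypothetical):

In [ ]:
from pyspark.sql import Row

# A Row behaves much like a named tuple: fields can be read by name.
person = Row(name="Justin", age=19)
person.name   # 'Justin'
person.age    # 19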

Spark DataFrame¶

  • A DataFrame is a Dataset organized into named columns.

  • It is conceptually equivalent to a table in a relational database or a data frame in R/Python, but with richer optimizations under the hood.

  • DataFrames can be constructed from a wide array of sources such as:

    • structured data files,
    • tables in Hive,
    • external databases, or
    • existing RDDs.
  • The DataFrame API is available in Scala, Java, Python, and R.

Start a Spark session¶

In [6]:
import findspark  # Only needed when you run Spark within a Jupyter notebook
findspark.init()
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder\
        .config("spark.executor.memory", "2g")\
        .config("spark.cores.max", "2")\
        .master("spark://master:7077")\
        .appName("Python Spark").getOrCreate() # using spark server
In [8]:
spark # check whether the Spark session has been created
Out[8]:

SparkSession - in-memory

SparkContext
  Spark UI
  Version: v2.4.4
  Master:  local[*]
  AppName: Python Spark
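
If the standalone cluster at spark://master:7077 is not available, the same builder pattern can be pointed at the local machine instead (a minimal sketch; local[*] means "use all local cores" and needs no cluster manager):

In [ ]:
from pyspark.sql import SparkSession

# Local-mode session: driver and executors run on this machine only.
spark = SparkSession.builder\
        .master("local[*]")\
        .appName("Python Spark").getOrCreate()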

Creating DataFrames¶

Convert an RDD to a DataFrame¶

In [9]:
sc = spark.sparkContext # get the SparkContext for RDD operations
In [10]:
sc
Out[10]:

SparkContext
  Spark UI
  Version: v2.4.4
  Master:  local[*]
  AppName: Python Spark
In [20]:
# Load a text file and convert each line to a Row.
from pyspark.sql import Row
lines = sc.textFile("data/people.txt")
parts = lines.map(lambda l: l.split(","))
people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))
 
# Infer the schema, and register the DataFrame as a table.
schemaPeople = spark.createDataFrame(people)
schemaPeople
Out[20]:
DataFrame[age: bigint, name: string]
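
A quick way to inspect the inferred schema is printSchema(); given the types shown above, its output would look roughly like the comments below:

In [ ]:
schemaPeople.printSchema()
# root
#  |-- age: long (nullable = true)
#  |-- name: string (nullable = true)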
In [24]:
# SQL can be run over DataFrames that have been registered as a table.
schemaPeople.createOrReplaceTempView("people")
teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
teenagers
Out[24]:
DataFrame[name: string]
In [29]:
teenagers.toPandas() # Export the Spark DataFrame to a regular pandas DataFrame
Out[29]:
     name
0  Justin

Create Spark DataFrame directly from a file¶

In [39]:
sdf = spark.read.csv("data/people.txt") # read the CSV file from the default filesystem (e.g. HDFS)
sdf.show() # Displays the content of the DataFrame to stdout
+-------+---+
|    _c0|_c1|
+-------+---+
|Michael| 29|
|   Andy| 30|
| Justin| 19|
+-------+---+
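
If a CSV file does carry a header row, the reader can pick up the column names and guess the column types itself. A sketch with a hypothetical file name (header and inferSchema are standard options of the CSV reader):

In [ ]:
# Hypothetical file whose first line is "name,age".
sdf_typed = spark.read.csv("data/people_with_header.csv",
                           header=True, inferSchema=True)
sdf_typed.printSchema()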

In [ ]:
# If your data are available locally, you can explicitly specify the path with "file://".
# For this to work, a copy of the file needs to be on every worker, or every worker
# needs access to a common shared drive, such as an NFS mount.

sdf = spark.read.csv("file:///home/fli/data/people.txt") # read a local file
sdf.show() # Displays the content of the DataFrame to stdout
In [1]:
sdf2 = spark.read.json("data/people.json")
# Displays the content of the DataFrame to stdout
sdf2.show()
+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+
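
DataFrame transformations work the same way regardless of the source format. For example, a short illustrative chain on the JSON-backed DataFrame:

In [ ]:
# Keep only the rows with a known age and display name and age.
sdf2.filter(sdf2.age.isNotNull()).select("name", "age").show()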

  • The CSV file does not have a header row, but we can create a description of the columns (a schema in Spark) for it
In [35]:
# Import data types
from pyspark.sql.types import *
# Create a schema: each column name in the list becomes a nullable string field.
schemaString = ["name", "age"]
fields = [StructField(field_name, StringType(), True) for field_name in schemaString]
schema = StructType(fields)
In [36]:
schema
Out[36]:
StructType(List(StructField(name,StringType,true),StructField(age,StringType,true)))
In [38]:
sdf_withschema = spark.createDataFrame(people, schema)
sdf_withschema.show()
+-------+---+
|   name|age|
+-------+---+
|Michael| 29|
|   Andy| 30|
| Justin| 19|
+-------+---+
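
The same schema can also be handed to the CSV reader directly, so the columns get proper names without relying on a header row (a minimal sketch; sdf_named is just an illustrative name):

In [ ]:
sdf_named = spark.read.csv("data/people.txt", schema=schema)
sdf_named.show()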

Convert DataFrame to RDD¶

In [43]:
rdd1 = sdf_withschema.rdd
rdd1
Out[43]:
MapPartitionsRDD[132] at javaToPython at NativeMethodAccessorImpl.java:0
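
The elements of this RDD are Row objects, so the usual RDD operations apply. A small illustrative action:

In [ ]:
# Pull the name field out of every Row and collect the results to the driver.
rdd1.map(lambda row: row.name).collect()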

Export DataFrame to a local disk¶

In [53]:
sdf.write.mode('overwrite').csv("myspark/") # Save the Spark DataFrame to a folder on the local disk.
In [49]:
import os 
os.listdir("myspark") # Let's check if everything is there on the local disk
Out[49]:
['_SUCCESS',
 '.part-00000-23e89afa-747c-46dd-9a9b-f2ee8d79b94b-c000.csv.crc',
 '._SUCCESS.crc',
 'part-00000-23e89afa-747c-46dd-9a9b-f2ee8d79b94b-c000.csv']
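
Spark writes one part file per partition. If a single CSV file is preferred, the DataFrame can be collapsed to one partition before writing (a sketch; the output folder name is only illustrative):

In [ ]:
# coalesce(1) merges the data into one partition, hence one part file.
sdf.coalesce(1).write.mode('overwrite').csv("myspark_single/")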