At a high level, every Spark application consists of a driver program that runs the user's main function and executes various parallel operations on a cluster.
The main abstraction Spark provides is a resilient distributed dataset (RDD), a collection of elements partitioned across the nodes of the cluster that can be operated on in parallel.
A second abstraction in Spark is shared variables that can be used in parallel operations.
By default, when Spark runs a function in parallel as a set of tasks on different nodes, it ships a copy of each variable used in the function to each task.
Sometimes, a variable needs to be shared across tasks, or between tasks and the driver program.
Spark supports two types of shared variables: broadcast variables, which can be used to cache a value in memory on all nodes, and accumulators, which are variables that are only "added" to, such as counters and sums.
Spark revolves around the concept of a resilient distributed dataset (RDD), which is a fault-tolerant collection of elements that can be operated on in parallel.
There are two ways to create RDDs: parallelizing an existing collection in your driver program, or referencing a dataset in an external storage system, such as a shared filesystem, HDFS, HBase, or any data source offering a Hadoop InputFormat.
The first thing a Spark program must do is create a SparkContext object, which tells Spark how to access a cluster. To create a SparkContext you first need to build a SparkConf object that contains information about your application.
import findspark
findspark.init()
import pyspark
conf = pyspark.SparkConf().setAppName("My First Spark RDD APP") #.setMaster("local") # "yarn"
sc = pyspark.SparkContext(conf=conf)
Only one SparkContext can be active at a time; to create a new SparkContext you have to stop the existing one first with sc.stop(). Alternatively, reuse the active context:
sc = pyspark.SparkContext.getOrCreate()
Parallelized collections are created by calling SparkContext's parallelize method on an existing iterable or collection in your driver program.
The elements of the collection are copied to form a distributed dataset that can be operated on in parallel.
data = [1, 2, 3, 4, 5]
distData = sc.parallelize(data)
distData
data2 = [11, 21, 31, 41, 51]
distData2 = sc.parallelize(data2)
distData2
ParallelCollectionRDD[12] at parallelize at PythonRDD.scala:195
PySpark can create distributed datasets from any storage source supported by Hadoop, including your local file system, HDFS, Cassandra, HBase, Amazon S3, etc. Spark supports text files, SequenceFiles, and any other Hadoop InputFormat.
Text file RDDs can be created using SparkContext's textFile method. This method takes a URI for the file (either a local path on the machine, or an hdfs://, s3a://, etc. URI) and reads it as a collection of lines.
licenseFile = sc.textFile("/opt/apps/ecm/service/spark/2.4.4/package/spark-2.4.4-bin-hadoop2.7/licenses/LICENSE-vis.txt")
licenseFile
/opt/apps/ecm/service/spark/2.4.4/package/spark-2.4.4-bin-hadoop2.7/licenses/LICENSE-vis.txt MapPartitionsRDD[14] at textFile at NativeMethodAccessorImpl.java:0
If using a path on the local filesystem, the file must also be accessible at the same path on worker nodes. Either copy the file to all workers or use a network-mounted shared file system.
All of Spark's file-based input methods, including textFile, support running on directories, compressed files, and wildcards as well. For example, you can use textFile("/my/directory"), textFile("/my/directory/*.txt"), and textFile("/my/directory/*.gz").
The textFile method also takes an optional second argument for controlling the number of partitions of the file. By default, Spark creates one partition for each block of the file (blocks being 128MB by default in HDFS), but you can also ask for a higher number of partitions by passing a larger value. Note that you cannot have fewer partitions than blocks.
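A minimal sketch of these two points, reusing the sc created earlier (the paths are illustrative only):
# Read every .gz file under a hypothetical directory as one RDD of lines.
logs = sc.textFile("/my/directory/*.gz")
# Ask for at least 8 partitions instead of the default one per HDFS block.
logs8 = sc.textFile("/my/directory/*.gz", minPartitions=8)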
SparkContext.wholeTextFiles lets you read a directory containing multiple small text files, and returns each of them as (filename, content) pairs. This is in contrast with textFile, which would return one record per line in each file.
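A short sketch of the difference, assuming a hypothetical directory of small text files and the existing sc:
# Each element is a (filename, content) pair covering one whole file.
files = sc.wholeTextFiles("/my/directory")
# Each element is a single line drawn from any of the files.
lines = sc.textFile("/my/directory")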
RDD.saveAsPickleFile and SparkContext.pickleFile support saving an RDD in a simple format consisting of pickled Python objects. Batching is used on pickle serialization, with a default batch size of 10.
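A brief sketch using the distData RDD created above; the output path is hypothetical:
# Write the RDD as batches of pickled Python objects (default batch size 10).
distData.saveAsPickleFile("/tmp/distData_pickle")
# Read it back later as an RDD of the same Python objects.
restored = sc.pickleFile("/tmp/distData_pickle")
restored.collect()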
# Transformation: map each line of the license file to its length (nothing is computed yet).
lineLengths = licenseFile.map(lambda s: len(s))
# Action: reduce triggers the computation and returns the total character count.
totalLength = lineLengths.reduce(lambda a, b: a + b)
lineLengths
PythonRDD[4] at RDD at PythonRDD.scala:53
totalLength
385
One of the most important capabilities in Spark is persisting (or caching) a dataset in memory across operations.
When you persist an RDD, each node stores any partitions of it that it computes in memory and reuses them in other actions on that dataset (or datasets derived from it).
This allows future actions to be much faster (often by more than 10x). Caching is a key tool for iterative algorithms and fast interactive use.
You can mark an RDD to be persisted using the persist()
or cache()
methods on it. The first time it is computed in an action, it will be kept in memory on the nodes. Spark’s cache is fault-tolerant – if any partition of an RDD is lost, it will automatically be recomputed using the transformations that originally created it.
lineLengths.persist()
PythonRDD[4] at RDD at PythonRDD.scala:53
lineLengths.cache() # cache() is shorthand for persist() with the default storage level (MEMORY_ONLY)
PythonRDD[4] at RDD at PythonRDD.scala:53
In addition, each persisted RDD can be stored using a different storage level. The available levels are:
MEMORY_ONLY: Store RDD as deserialized Java objects in the JVM. If the RDD does not fit in memory, some partitions will not be cached and will be recomputed on the fly each time they're needed. This is the default level.
MEMORY_AND_DISK: Store RDD as deserialized Java objects in the JVM. If the RDD does not fit in memory, store the partitions that don't fit on disk, and read them from there when they're needed.
MEMORY_ONLY_SER (Java and Scala): Store RDD as serialized Java objects (one byte array per partition). This is generally more space-efficient than deserialized objects, especially when using a fast serializer, but more CPU-intensive to read.
MEMORY_AND_DISK_SER (Java and Scala): Similar to MEMORY_ONLY_SER, but spill partitions that don't fit in memory to disk instead of recomputing them on the fly each time they're needed.
DISK_ONLY: Store the RDD partitions only on disk.
MEMORY_ONLY_2, MEMORY_AND_DISK_2, etc.: Same as the levels above, but replicate each partition on two cluster nodes.
OFF_HEAP (experimental): Similar to MEMORY_ONLY_SER, but store the data in off-heap memory. This requires off-heap memory to be enabled.
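A minimal sketch of requesting a non-default level for the licenseFile RDD defined earlier. Note that in Python, persisted objects are always serialized with pickle, so choosing a serialized level makes no difference there.
from pyspark import StorageLevel
# Keep what fits in memory and spill the remaining partitions to disk.
licenseFile.persist(StorageLevel.MEMORY_AND_DISK)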
Spark automatically monitors cache usage on each node and drops out old data partitions in a least-recently-used (LRU) fashion.
If you would like to manually remove an RDD instead of waiting for it to fall out of the cache, use the RDD.unpersist() method.
lineLengths.unpersist()
PythonRDD[4] at RDD at PythonRDD.scala:53
Broadcast variables allow the programmer to keep a read-only variable cached on each machine rather than shipping a copy of it with tasks.
They can be used, for example, to give every node a copy of a large input dataset in an efficient manner. Spark also attempts to distribute broadcast variables using efficient broadcast algorithms to reduce communication cost.
broadcastVar = sc.broadcast([1, 2, 3])
broadcastVar
<pyspark.broadcast.Broadcast at 0x7f5067b011d0>
Broadcast variables are created from a variable v by calling SparkContext.broadcast(v). The broadcast variable is a wrapper around v, and its value can be accessed through its .value attribute.
After the broadcast variable is created, it should be used instead of the value v in any functions run on the cluster so that v is not shipped to the nodes more than once.
In addition, the object v should not be modified after it is broadcast in order to ensure that all nodes get the same value of the broadcast variable (e.g. if the variable is shipped to a new node later).
broadcastVar.value
[1, 2, 3]
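A short sketch, reusing distData and broadcastVar from above: tasks should read the broadcast's .value rather than capturing the raw list in the closure.
# Each task reads broadcastVar.value locally instead of shipping the list with every closure.
offsets = distData.map(lambda x: x + sum(broadcastVar.value))
offsets.collect()  # [7, 8, 9, 10, 11]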
Accumulators are variables that are only “added” to through an associative and commutative operation and can therefore be efficiently supported in parallel.
They can be used to implement counters (as in MapReduce) or sums. Spark natively supports accumulators of numeric types, and programmers can add support for new types.
accum = sc.accumulator(0)
sc.parallelize([1, 2, 3, 4]).foreach(lambda x: accum.add(x))
accum.value
10
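Support for new types is added by subclassing AccumulatorParam with zero and addInPlace methods; a minimal sketch for accumulating 2-element lists (the class and variable names are illustrative):
from pyspark.accumulators import AccumulatorParam

class ListAccumulatorParam(AccumulatorParam):
    def zero(self, initialValue):
        # The "empty" value the accumulator starts from on each executor.
        return [0.0] * len(initialValue)
    def addInPlace(self, v1, v2):
        # Merge two partial results element-wise.
        return [a + b for a, b in zip(v1, v2)]

vecAccum = sc.accumulator([0.0, 0.0], ListAccumulatorParam())
sc.parallelize([[1.0, 2.0], [3.0, 4.0]]).foreach(lambda v: vecAccum.add(v))
vecAccum.value  # [4.0, 6.0]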
Lazy evaluation means that Spark evaluates a transformation only when its result is actually required, not at the moment it is defined.
Accordingly, Spark does not execute each operation right away; nothing starts until an action is triggered.
Once an action is called, all pending transformations are executed in one go, since Spark can combine the chained transformations and run them together.
Benefits: Spark sees the whole chain of transformations before running anything, so it can optimize the execution plan, avoid materializing intermediate results, and skip work whose output is never used.
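A small sketch of this behaviour with the licenseFile RDD from earlier: the map returns immediately, and the file is only read when the action runs.
# Declaring the transformation is instantaneous; no data is read yet.
upper = licenseFile.map(lambda s: s.upper())
# The action forces Spark to read the file and apply the map in one pass.
upper.count()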
MLlib supports local vectors and matrices stored on a single machine, as well as distributed matrices backed by one or more RDDs.
import numpy as np
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors
# Use a NumPy array as a dense vector.
dv1 = np.array([1.0, 0.0, 3.0])
# Use a Python list as a dense vector.
dv2 = [1.0, 0.0, 3.0]
# Create a SparseVector.
sv1 = Vectors.sparse(3, [0, 2], [1.0, 3.0])
# Use a single-column SciPy csc_matrix as a sparse vector.
sv2 = sps.csc_matrix((np.array([1.0, 3.0]),
                      np.array([0, 2]),
                      np.array([0, 2])), shape=(3, 1))
from pyspark.mllib.linalg import Matrix, Matrices
# Create a dense matrix ((1.0, 4.0), (2.0, 5.0), (3.0, 6.0)); values are read in column-major order.
dm2 = Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])
print(dm2)
# Create a sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0))
sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])
print(sm)
DenseMatrix([[1., 4.],
             [2., 5.],
             [3., 6.]])
3 X 2 CSCMatrix
(0,0) 9.0
(2,1) 6.0
(1,1) 8.0
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint
# Create a labeled point with a positive label and a dense feature vector.
pos = LabeledPoint(1.0, [1.0, 0.0, 3.0])
# Create a labeled point with a negative label and a sparse feature vector.
neg = LabeledPoint(0.0, SparseVector(3, [0, 2], [1.0, 3.0]))
from pyspark.mllib.util import MLUtils
examples = MLUtils.loadLibSVMFile(sc,
"/opt/apps/ecm/service/spark/2.4.4/package/spark-2.4.4-bin-hadoop2.7/data/mllib/sample_libsvm_data.txt")
print(examples)
from pyspark.mllib.linalg.distributed import RowMatrix
# Create an RDD of vectors.
rows = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
# Create a RowMatrix from an RDD of vectors.
mat = RowMatrix(rows)
# Get its size.
m = mat.numRows() # 4
n = mat.numCols() # 3
# Get the rows as an RDD of vectors again.
rowsRDD = mat.rows
print(m,n,rowsRDD)
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
# Create an RDD of indexed rows.
# - This can be done explicitly with the IndexedRow class:
indexedRows = sc.parallelize([IndexedRow(0, [1, 2, 3]),
                              IndexedRow(1, [4, 5, 6]),
                              IndexedRow(2, [7, 8, 9]),
                              IndexedRow(3, [10, 11, 12])])
# - or by using (long, vector) tuples:
indexedRows = sc.parallelize([(0, [1, 2, 3]), (1, [4, 5, 6]),
                              (2, [7, 8, 9]), (3, [10, 11, 12])])
# Create an IndexedRowMatrix from the RDD of indexed rows.
mat = IndexedRowMatrix(indexedRows)
A BlockMatrix is a distributed matrix backed by an RDD of MatrixBlocks, where a MatrixBlock is a tuple ((Int, Int), Matrix): the (Int, Int) is the index of the block, and the Matrix is the sub-matrix at that index with size rowsPerBlock x colsPerBlock. BlockMatrix supports methods such as add and multiply with another BlockMatrix. BlockMatrix also has a helper function validate which can be used to check whether the BlockMatrix is set up properly.
from pyspark.mllib.linalg import Matrices
from pyspark.mllib.linalg.distributed import BlockMatrix
# Create an RDD of sub-matrix blocks.
blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),
                         ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])
# Create a BlockMatrix from an RDD of sub-matrix blocks.
mat = BlockMatrix(blocks, 3, 2)
# Get its size.
m = mat.numRows() # 6
n = mat.numCols() # 2
# Get the blocks as an RDD of sub-matrix blocks.
blocksRDD = mat.blocks
# Convert to a LocalMatrix.
localMat = mat.toLocalMatrix()
# Convert to an IndexedRowMatrix.
indexedRowMat = mat.toIndexedRowMatrix()
# Convert to a CoordinateMatrix.
coordinateMat = mat.toCoordinateMatrix()
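A brief sketch of the add and multiply operations mentioned above, continuing from the BlockMatrix mat just created:
# Multiply the 6x2 matrix by its transpose, yielding a 6x6 BlockMatrix.
product = mat.multiply(mat.transpose())
# Element-wise addition of the matrix with itself.
summed = mat.add(mat)
print(product.numRows(), product.numCols())  # 6 6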
Run a simple Spark application.
Understand the basic data structure with the RDD interface.
External Reading