import findspark  # only needed when you run Spark within a Jupyter notebook
findspark.init()
import pyspark
from pyspark.sql import SparkSession

# Connect to a standalone Spark cluster and cap the resources used by this application.
spark = SparkSession.builder\
    .config("spark.executor.memory", "2g")\
    .config("spark.cores.max", "2")\
    .master("spark://master:7077")\
    .appName("Python Spark").getOrCreate()
Running this in a notebook may print a UserWarning that the PYARROW_IGNORE_TIMEZONE environment variable is not set: with pyarrow >= 2.0.0 it must be set to '1' on both the driver and the executors, and pandas-on-Spark cannot set it for you once a Spark context has already been launched. A further warning about the missing native-hadoop library can be ignored.
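One way to avoid the PyArrow warning is to set the variable before the session is created. A minimal sketch, using Spark's spark.executorEnv.* mechanism to forward the variable to executors:

import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"   # driver side; set before the SparkSession exists

spark = (SparkSession.builder
         .config("spark.executor.memory", "2g")
         .config("spark.cores.max", "2")
         # executor side: spark.executorEnv.<VAR> exports the variable to executor processes
         .config("spark.executorEnv.PYARROW_IGNORE_TIMEZONE", "1")
         .master("spark://master:7077")
         .appName("Python Spark")
         .getOrCreate())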
import pyspark.pandas as ps
import pandas as pd

# Build a small pandas-on-Spark DataFrame and draw an area chart of one column.
df = ps.DataFrame({
    'sales': [3, 2, 3, 9, 10, 6],
    'signups': [5, 5, 6, 12, 14, 13],
    'visits': [20, 42, 28, 62, 81, 50],
})
df.sales.plot.area()
# Horizontal bar chart: one bar per animal, using the animal names as the index.
speed = [0.1, 17.5, 40, 48, 52, 69, 88]
lifespan = [2, 8, 70, 1.5, 25, 12, 28]
index = ['snail', 'pig', 'elephant',
         'rabbit', 'giraffe', 'coyote', 'horse']
df = ps.DataFrame({'speed': speed,
                   'lifespan': lifespan}, index=index)
df.plot.barh()
# Kernel density estimate of both columns, with a deliberately wide bandwidth.
df = ps.DataFrame({
    'x': [1, 2, 2.5, 3, 3.5, 4, 5],
    'y': [4, 4, 4.5, 5, 5.5, 6, 6],
})
df.plot.kde(bw_method=3)
import numpy as np

# Simulate 6000 rolls of two dice in pandas, convert to pandas-on-Spark,
# and plot overlapping histograms of the two columns.
df = pd.DataFrame(
    np.random.randint(1, 7, 6000),
    columns=['one'])
df['two'] = df['one'] + np.random.randint(1, 7, 6000)
df = ps.from_pandas(df)
df.plot.hist(bins=12, alpha=0.5)
# Box plot of a single column of standard-normal samples.
data = np.random.randn(25, 4)
df = ps.DataFrame(data, columns=list('ABCD'))
df['A'].plot.box()
# Scatter plot; the returned object is a Plotly figure, so Plotly methods
# such as update_layout can be used to restyle it.
df = ps.DataFrame([[5.1, 3.5, 0], [4.9, 3.0, 0], [7.0, 3.2, 1],
                   [6.4, 3.2, 1], [5.9, 3.0, 2]],
                  columns=['length', 'width', 'species'])
fig = df.plot.scatter(x='length', y='width')
fig.update_layout(template="plotly_dark")
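The scatter call returns a Plotly figure because plotly is the default plotting backend for pandas-on-Spark. If static matplotlib output is preferred, the backend can be switched; a minimal sketch:

# Switch the pandas-on-Spark plotting backend from the default "plotly" to matplotlib.
ps.set_option("plotting.backend", "matplotlib")
df.plot.scatter(x='length', y='width')  # now rendered with matplotlib instead of Plotly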
# Pie chart of planetary mass, one slice per planet.
df = ps.DataFrame({'mass': [0.330, 4.87, 5.97],
                   'radius': [2439.7, 6051.8, 6378.1]},
                  index=['Mercury', 'Venus', 'Earth'])
df.plot.pie(y='mass')
Use the airdelay_small.csv data to visualize the delay trend.
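One possible sketch of this task, assuming the file holds flight records with columns such as Year, Month, and ArrDelay (the actual column names in airdelay_small.csv may differ):

# Read the CSV directly into a pandas-on-Spark DataFrame.
# The column names below (Year, Month, ArrDelay) are assumptions about the file's schema.
air = ps.read_csv("airdelay_small.csv")
monthly = air.groupby(["Year", "Month"])["ArrDelay"].mean().reset_index()
monthly = monthly.sort_values(["Year", "Month"])
monthly["ArrDelay"].plot.line()  # average delay per month as a simple trend line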