{
"cells": [
{
"cell_type": "markdown",
"id": "5442af61",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# Introduction to Spark Visualization\n",
"\n",
"## Feng Li\n",
"\n",
"### Central University of Finance and Economics\n",
"\n",
"### [feng.li@cufe.edu.cn](feng.li@cufe.edu.cn)\n",
"### Course home page: [https://feng.li/distcomp](https://feng.li/distcomp)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c179d02b",
"metadata": {
"scrolled": false,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/fli/.APP/spark/python/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n",
" warnings.warn(\n",
"Setting default log level to \"WARN\".\n",
"To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
"23/06/08 17:14:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
]
}
],
"source": [
"import findspark ## Only needed when you run spark witin Jupyter notebook\n",
"findspark.init()\n",
"import pyspark\n",
"from pyspark.sql import SparkSession\n",
"spark = SparkSession.builder\\\n",
" .config(\"spark.executor.memory\", \"2g\")\\\n",
" .config(\"spark.cores.max\", \"2\")\\\n",
" .master(\"spark://master:7077\")\\\n",
" .appName(\"Python Spark\").getOrCreate() # using spark server\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "50d86587",
"metadata": {
"scrolled": false,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" \r"
]
},
{
"data": {
"text/html": [
" \n",
" "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"plotlyServerURL": "https://plot.ly"
},
"data": [
{
"fillpattern": {
"shape": ""
},
"hovertemplate": "variable=sales
index=%{x}
value=%{y}