{ "cells": [ { "cell_type": "markdown", "id": "5442af61", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Introduction to Spark Visualization\n", "\n", "## Feng Li\n", "\n", "### Central University of Finance and Economics\n", "\n", "### [feng.li@cufe.edu.cn](feng.li@cufe.edu.cn)\n", "### Course home page: [https://feng.li/distcomp](https://feng.li/distcomp)" ] }, { "cell_type": "code", "execution_count": 3, "id": "c179d02b", "metadata": { "scrolled": false, "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/fli/.APP/spark/python/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.\n", " warnings.warn(\n", "Setting default log level to \"WARN\".\n", "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", "23/06/08 17:14:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" ] } ], "source": [ "import findspark ## Only needed when you run spark witin Jupyter notebook\n", "findspark.init()\n", "import pyspark\n", "from pyspark.sql import SparkSession\n", "spark = SparkSession.builder\\\n", " .config(\"spark.executor.memory\", \"2g\")\\\n", " .config(\"spark.cores.max\", \"2\")\\\n", " .master(\"spark://master:7077\")\\\n", " .appName(\"Python Spark\").getOrCreate() # using spark server\n" ] }, { "cell_type": "code", "execution_count": 4, "id": "50d86587", "metadata": { "scrolled": false, "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "data": { "text/html": [ "