{ "cells": [ { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Working with Spark DataFrame\n", "\n", "## Feng Li\n", "\n", "### Central University of Finance and Economics\n", "\n", "### [feng.li@cufe.edu.cn](mailto:feng.li@cufe.edu.cn)\n", "### Course home page: [https://feng.li/distcomp](https://feng.li/distcomp)" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Start a Spark Session" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/html": [ "\n", "
SparkSession - in-memory
\n", " \n", "SparkContext
\n", "\n", " \n", "\n", "v2.4.5
local[*]
Python Spark with DataFrame
\n", " | DayOfWeek | \n", "ArrDelay | \n", "AirTime | \n", "Distance | \n", "
---|---|---|---|---|
0 | \n", "4.0 | \n", "2.0 | \n", "25.0 | \n", "127.0 | \n", "
1 | \n", "7.0 | \n", "29.0 | \n", "248.0 | \n", "1623.0 | \n", "
2 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
3 | \n", "5.0 | \n", "-2.0 | \n", "70.0 | \n", "451.0 | \n", "
4 | \n", "7.0 | \n", "11.0 | \n", "133.0 | \n", "1009.0 | \n", "
5 | \n", "7.0 | \n", "13.0 | \n", "177.0 | \n", "1562.0 | \n", "
6 | \n", "1.0 | \n", "-12.0 | \n", "181.0 | \n", "1589.0 | \n", "
7 | \n", "3.0 | \n", "11.0 | \n", "364.0 | \n", "2611.0 | \n", "
8 | \n", "5.0 | \n", "13.0 | \n", "53.0 | \n", "304.0 | \n", "
9 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
10 | \n", "5.0 | \n", "-8.0 | \n", "293.0 | \n", "2537.0 | \n", "
11 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
12 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
13 | \n", "2.0 | \n", "55.0 | \n", "285.0 | \n", "1927.0 | \n", "
14 | \n", "1.0 | \n", "23.0 | \n", "149.0 | \n", "991.0 | \n", "
15 | \n", "4.0 | \n", "64.0 | \n", "35.0 | \n", "193.0 | \n", "
16 | \n", "4.0 | \n", "29.0 | \n", "25.0 | \n", "77.0 | \n", "
17 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
18 | \n", "7.0 | \n", "-6.0 | \n", "91.0 | \n", "678.0 | \n", "
19 | \n", "7.0 | \n", "35.0 | \n", "127.0 | \n", "998.0 | \n", "
20 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
21 | \n", "2.0 | \n", "-7.0 | \n", "76.0 | \n", "508.0 | \n", "
22 | \n", "4.0 | \n", "60.0 | \n", "65.0 | \n", "370.0 | \n", "
23 | \n", "4.0 | \n", "-7.0 | \n", "66.0 | \n", "407.0 | \n", "
24 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
25 | \n", "3.0 | \n", "35.0 | \n", "313.0 | \n", "2421.0 | \n", "
26 | \n", "2.0 | \n", "-7.0 | \n", "137.0 | \n", "1121.0 | \n", "
27 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
28 | \n", "5.0 | \n", "12.0 | \n", "137.0 | \n", "1185.0 | \n", "
29 | \n", "1.0 | \n", "-1.0 | \n", "46.0 | \n", "272.0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
5548724 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
5548725 | \n", "1.0 | \n", "-11.0 | \n", "110.0 | \n", "846.0 | \n", "
5548726 | \n", "3.0 | \n", "2.0 | \n", "77.0 | \n", "612.0 | \n", "
5548727 | \n", "2.0 | \n", "3.0 | \n", "155.0 | \n", "1087.0 | \n", "
5548728 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
5548729 | \n", "1.0 | \n", "8.0 | \n", "131.0 | \n", "984.0 | \n", "
5548730 | \n", "7.0 | \n", "31.0 | \n", "153.0 | \n", "1086.0 | \n", "
5548731 | \n", "4.0 | \n", "5.0 | \n", "91.0 | \n", "641.0 | \n", "
5548732 | \n", "7.0 | \n", "-13.0 | \n", "124.0 | \n", "1005.0 | \n", "
5548733 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
5548734 | \n", "2.0 | \n", "-7.0 | \n", "214.0 | \n", "1900.0 | \n", "
5548735 | \n", "5.0 | \n", "-7.0 | \n", "159.0 | \n", "1195.0 | \n", "
5548736 | \n", "1.0 | \n", "-1.0 | \n", "108.0 | \n", "773.0 | \n", "
5548737 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
5548738 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
5548739 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
5548740 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
5548741 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
5548742 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
5548743 | \n", "6.0 | \n", "-13.0 | \n", "40.0 | \n", "160.0 | \n", "
5548744 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
5548745 | \n", "1.0 | \n", "10.0 | \n", "63.0 | \n", "369.0 | \n", "
5548746 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
5548747 | \n", "5.0 | \n", "-6.0 | \n", "203.0 | \n", "1471.0 | \n", "
5548748 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
5548749 | \n", "3.0 | \n", "13.0 | \n", "59.0 | \n", "318.0 | \n", "
5548750 | \n", "1.0 | \n", "22.0 | \n", "34.0 | \n", "181.0 | \n", "
5548751 | \n", "1.0 | \n", "11.0 | \n", "71.0 | \n", "551.0 | \n", "
5548752 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
5548753 | \n", "2.0 | \n", "-14.0 | \n", "107.0 | \n", "888.0 | \n", "
5548754 rows × 4 columns
\n", "