{ "cells": [ { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Working with Spark DataFrame\n", "\n", "## Feng Li\n", "\n", "### Central University of Finance and Economics\n", "\n", "### [feng.li@cufe.edu.cn](mailto:feng.li@cufe.edu.cn)\n", "### Course home page: [https://feng.li/distcomp](https://feng.li/distcomp)" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Start a Spark Session" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/html": [ "\n", "
SparkSession - in-memory
\n", " \n", "SparkContext
\n", "\n", " \n", "\n", "v2.4.5local[*]Python Spark with DataFrame| \n", " | DayOfWeek | \n", "ArrDelay | \n", "AirTime | \n", "Distance | \n", "
|---|---|---|---|---|
| 0 | \n", "4.0 | \n", "2.0 | \n", "25.0 | \n", "127.0 | \n", "
| 1 | \n", "7.0 | \n", "29.0 | \n", "248.0 | \n", "1623.0 | \n", "
| 2 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 3 | \n", "5.0 | \n", "-2.0 | \n", "70.0 | \n", "451.0 | \n", "
| 4 | \n", "7.0 | \n", "11.0 | \n", "133.0 | \n", "1009.0 | \n", "
| 5 | \n", "7.0 | \n", "13.0 | \n", "177.0 | \n", "1562.0 | \n", "
| 6 | \n", "1.0 | \n", "-12.0 | \n", "181.0 | \n", "1589.0 | \n", "
| 7 | \n", "3.0 | \n", "11.0 | \n", "364.0 | \n", "2611.0 | \n", "
| 8 | \n", "5.0 | \n", "13.0 | \n", "53.0 | \n", "304.0 | \n", "
| 9 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 10 | \n", "5.0 | \n", "-8.0 | \n", "293.0 | \n", "2537.0 | \n", "
| 11 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 12 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 13 | \n", "2.0 | \n", "55.0 | \n", "285.0 | \n", "1927.0 | \n", "
| 14 | \n", "1.0 | \n", "23.0 | \n", "149.0 | \n", "991.0 | \n", "
| 15 | \n", "4.0 | \n", "64.0 | \n", "35.0 | \n", "193.0 | \n", "
| 16 | \n", "4.0 | \n", "29.0 | \n", "25.0 | \n", "77.0 | \n", "
| 17 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 18 | \n", "7.0 | \n", "-6.0 | \n", "91.0 | \n", "678.0 | \n", "
| 19 | \n", "7.0 | \n", "35.0 | \n", "127.0 | \n", "998.0 | \n", "
| 20 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 21 | \n", "2.0 | \n", "-7.0 | \n", "76.0 | \n", "508.0 | \n", "
| 22 | \n", "4.0 | \n", "60.0 | \n", "65.0 | \n", "370.0 | \n", "
| 23 | \n", "4.0 | \n", "-7.0 | \n", "66.0 | \n", "407.0 | \n", "
| 24 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 25 | \n", "3.0 | \n", "35.0 | \n", "313.0 | \n", "2421.0 | \n", "
| 26 | \n", "2.0 | \n", "-7.0 | \n", "137.0 | \n", "1121.0 | \n", "
| 27 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 28 | \n", "5.0 | \n", "12.0 | \n", "137.0 | \n", "1185.0 | \n", "
| 29 | \n", "1.0 | \n", "-1.0 | \n", "46.0 | \n", "272.0 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 5548724 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 5548725 | \n", "1.0 | \n", "-11.0 | \n", "110.0 | \n", "846.0 | \n", "
| 5548726 | \n", "3.0 | \n", "2.0 | \n", "77.0 | \n", "612.0 | \n", "
| 5548727 | \n", "2.0 | \n", "3.0 | \n", "155.0 | \n", "1087.0 | \n", "
| 5548728 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 5548729 | \n", "1.0 | \n", "8.0 | \n", "131.0 | \n", "984.0 | \n", "
| 5548730 | \n", "7.0 | \n", "31.0 | \n", "153.0 | \n", "1086.0 | \n", "
| 5548731 | \n", "4.0 | \n", "5.0 | \n", "91.0 | \n", "641.0 | \n", "
| 5548732 | \n", "7.0 | \n", "-13.0 | \n", "124.0 | \n", "1005.0 | \n", "
| 5548733 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 5548734 | \n", "2.0 | \n", "-7.0 | \n", "214.0 | \n", "1900.0 | \n", "
| 5548735 | \n", "5.0 | \n", "-7.0 | \n", "159.0 | \n", "1195.0 | \n", "
| 5548736 | \n", "1.0 | \n", "-1.0 | \n", "108.0 | \n", "773.0 | \n", "
| 5548737 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 5548738 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 5548739 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 5548740 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 5548741 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 5548742 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 5548743 | \n", "6.0 | \n", "-13.0 | \n", "40.0 | \n", "160.0 | \n", "
| 5548744 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 5548745 | \n", "1.0 | \n", "10.0 | \n", "63.0 | \n", "369.0 | \n", "
| 5548746 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 5548747 | \n", "5.0 | \n", "-6.0 | \n", "203.0 | \n", "1471.0 | \n", "
| 5548748 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 5548749 | \n", "3.0 | \n", "13.0 | \n", "59.0 | \n", "318.0 | \n", "
| 5548750 | \n", "1.0 | \n", "22.0 | \n", "34.0 | \n", "181.0 | \n", "
| 5548751 | \n", "1.0 | \n", "11.0 | \n", "71.0 | \n", "551.0 | \n", "
| 5548752 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 5548753 | \n", "2.0 | \n", "-14.0 | \n", "107.0 | \n", "888.0 | \n", "
5548754 rows × 4 columns
\n", "