{ "cells": [ { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Working with Spark DataFrame\n", "\n", "## Feng Li\n", "\n", "### Central University of Finance and Economics\n", "\n", "### [feng.li@cufe.edu.cn](feng.li@cufe.edu.cn)\n", "### Course home page: [https://feng.li/distcomp](https://feng.li/distcomp)" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Start a Spark Session" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "

SparkSession - in-memory

\n", " \n", "
\n", "

SparkContext

\n", "\n", "

Spark UI

\n", "\n", "
\n", "
Version
\n", "
v2.4.5
\n", "
Master
\n", "
local[*]
\n", "
AppName
\n", "
Python Spark with DataFrame
\n", "
\n", "
\n", " \n", "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import findspark\n", "findspark.init()\n", "import pyspark\n", "from pyspark.sql import SparkSession\n", "spark = SparkSession.builder\\\n", " .config(\"spark.executor.memory\", \"2g\")\\\n", " .config(\"spark.cores.max\", \"2\")\\\n", " .master(\"spark://master:7077\")\\\n", " .appName(\"Python Spark\").getOrCreate() # using spark server" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "**Note:**\n", "\n", "If you have problem to start pyspark interactive session due to system limitation. You could submit your spakr Job via the `spark-submit` command as below.\n", "\n", "\n", "```\n", "PYSPARK-PYTHON=python3.7 spark-submit \\\n", " --conf spark.ui.enabled=false \\\n", " your-pyspark-code.py \n", "```" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Read file and infer the schema from the header" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "source": [ "## Load a file becasue we are using spark's master mode\n", "air0 = spark.read.options(header='true', inferSchema='true').csv(\"/data/airdelay_small.csv\") " ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/plain": [ "DataFrame[Year: int, Month: int, DayofMonth: int, DayOfWeek: int, DepTime: string, CRSDepTime: int, ArrTime: string, CRSArrTime: int, UniqueCarrier: string, FlightNum: int, TailNum: string, ActualElapsedTime: string, CRSElapsedTime: string, AirTime: string, ArrDelay: string, DepDelay: string, Origin: string, Dest: string, Distance: string, TaxiIn: string, TaxiOut: string, Cancelled: int, CancellationCode: string, Diverted: int, CarrierDelay: string, WeatherDelay: string, NASDelay: string, SecurityDelay: string, LateAircraftDelay: string]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "air0 # the schema is not correct for some variables" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "# We specify the correct schema by hand\n", "from pyspark.sql.types import *\n", "schema_sdf = StructType([\n", " StructField('Year', IntegerType(), True),\n", " StructField('Month', IntegerType(), True),\n", " StructField('DayofMonth', IntegerType(), True),\n", " StructField('DayOfWeek', IntegerType(), True),\n", " StructField('DepTime', DoubleType(), True),\n", " StructField('CRSDepTime', DoubleType(), True),\n", " StructField('ArrTime', DoubleType(), True),\n", " StructField('CRSArrTime', DoubleType(), True),\n", " StructField('UniqueCarrier', StringType(), True),\n", " StructField('FlightNum', StringType(), True),\n", " StructField('TailNum', StringType(), True),\n", " StructField('ActualElapsedTime', DoubleType(), True),\n", " StructField('CRSElapsedTime', DoubleType(), True),\n", " StructField('AirTime', DoubleType(), True),\n", " StructField('ArrDelay', DoubleType(), True),\n", " StructField('DepDelay', DoubleType(), True),\n", " StructField('Origin', Str ingType(), True),\n", " StructFi eld('Dest', StringType(), True),\n", " StructField('Distance', DoubleType(), True),\n", " StructField('TaxiIn', DoubleType(), True),\n", " StructField('TaxiOut', DoubleType(), True),\n", " StructField('Cancelled', IntegerType(), True),\n", " StructField('CancellationCode', StringType(), True),\n", " StructField('Diverted', IntegerType(), True),\n", " StructField('CarrierDelay', DoubleType(), True),\n", " StructField('WeatherDelay', DoubleType(), True),\n", " StructField('NASDelay', DoubleType(), True),\n", " StructField('SecurityDelay', DoubleType(), True),\n", " StructField('LateAircraftDelay', DoubleType(), True)\n", " ]) " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "air = spark.read.options(header='true').schema(schema_sdf).csv(\"/data/airdelay_small.csv\")\n", "air" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "data": { "text/plain": [ "DataFrame[Year: int, Month: int, DayofMonth: int, DayOfWeek: int, DepTime: double, CRSDepTime: double, ArrTime: double, CRSArrTime: double, UniqueCarrier: string, FlightNum: string, TailNum: string, ActualElapsedTime: double, CRSElapsedTime: double, AirTime: double, ArrDelay: double, DepDelay: double, Origin: string, Dest: string, Distance: double, TaxiIn: double, TaxiOut: double, Cancelled: int, CancellationCode: string, Diverted: int, CarrierDelay: double, WeatherDelay: double, NASDelay: double, SecurityDelay: double, LateAircraftDelay: double]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "air" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Descriptive Statistics" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+-------------+------------------+-------+------------------+------------------+------------------+-----------------+------------------+-------+-------+-----------------+------------------+------------------+---------+----------------+--------+------------------+------------------+------------------+--------------------+------------------+\n", "|summary| Year| Month| DayofMonth| DayOfWeek| DepTime| CRSDepTime| ArrTime| CRSArrTime|UniqueCarrier| FlightNum|TailNum| ActualElapsedTime| CRSElapsedTime| AirTime| ArrDelay| DepDelay| Origin| Dest| Distance| TaxiIn| TaxiOut|Cancelled|CancellationCode|Diverted| CarrierDelay| WeatherDelay| NASDelay| SecurityDelay| LateAircraftDelay|\n", "+-------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+-------------+------------------+-------+------------------+------------------+------------------+-----------------+------------------+-------+-------+-----------------+------------------+------------------+---------+----------------+--------+------------------+------------------+------------------+--------------------+------------------+\n", "| count| 1524481| 1524481| 1524481| 1524481| 1524481| 1524481| 1524481| 1524481| 1524481| 1524481|1524481| 1524481| 1524481| 1524481| 1524481| 1524481|1524481|1524481| 1524481| 1524481| 1524481| 1524481| 0| 1524481| 1524481| 1524481| 1524481| 1524481| 1524481|\n", "| mean|2005.2200309482375| 6.819439533847913|15.739877374660622|3.9425345412635515|1343.1007890554229|1335.312045870037|1490.5321424143692|1499.0464518744411| null|2100.4755290489024| 0.0|124.89908893584112|126.23595243233599|102.60629092786331|7.728503667805634| 9.065091660702889| null| null|723.8429826281863|7.4043402311999955|16.027061012895537| 0.0| null| 0.0|3.1917360728011697|0.6896806191746568| 3.555705187535955|0.025394216129948487| 4.100354809276075|\n", "| stddev|1.3298684774182468|3.3772885531435493| 8.788305183327493|1.9915258421755744|475.47666780749284|462.8261301202478| 499.1625758193302| 478.3461722588984| null| 1937.457867690337| 0.0| 70.9126133314637| 70.11716827588829| 78.48819536331416|35.41450193208633|32.169219586304244| null| null|571.7621650788815| 35.98689560676167|12.080643220148497| 0.0| null| 0.0| 18.41498545167556| 8.708781309481942|15.282413095867522| 1.1406729402330553|18.704288068396867|\n", "| min| 2003| 1| 1| 1| 1.0| 3.0| 1.0| 0.0| 9E| 1| 0| -681.0| 16.0| -1461.0| -735.0| -1197.0| ABE| ABE| 27.0| 0.0| 0.0| 0| null| 0| 0.0| 0.0| -13.0| 0.0| 0.0|\n", "| max| 2007| 12| 31| 7| 2644.0| 2359.0| 2742.0| 2359.0| YV| 999| n816ca| 1766.0| 660.0| 1936.0| 1779.0| 1752.0| YUM| YUM| 4962.0| 1470.0| 1439.0| 0| null| 0| 1665.0| 910.0| 1010.0| 382.0| 1060.0|\n", "+-------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+-------------+------------------+-------+------------------+------------------+------------------+-----------------+------------------+-------+-------+-----------------+------------------+------------------+---------+----------------+--------+------------------+------------------+------------------+--------------------+------------------+\n", "\n" ] } ], "source": [ "air.describe().show()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+------------------+\n", "|summary| ArrDelay|\n", "+-------+------------------+\n", "| count| 5548754|\n", "| mean| 6.97897995898367|\n", "| stddev|30.191156753519472|\n", "| min| -1|\n", "| max| NA|\n", "+-------+------------------+\n", "\n" ] } ], "source": [ "air.describe(['ArrDelay']).show()" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "### Print the schema in a tree format" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- Year: integer (nullable = true)\n", " |-- Month: integer (nullable = true)\n", " |-- DayofMonth: integer (nullable = true)\n", " |-- DayOfWeek: integer (nullable = true)\n", " |-- DepTime: string (nullable = true)\n", " |-- CRSDepTime: integer (nullable = true)\n", " |-- ArrTime: string (nullable = true)\n", " |-- CRSArrTime: integer (nullable = true)\n", " |-- UniqueCarrier: string (nullable = true)\n", " |-- FlightNum: integer (nullable = true)\n", " |-- TailNum: string (nullable = true)\n", " |-- ActualElapsedTime: string (nullable = true)\n", " |-- CRSElapsedTime: string (nullable = true)\n", " |-- AirTime: string (nullable = true)\n", " |-- ArrDelay: string (nullable = true)\n", " |-- DepDelay: string (nullable = true)\n", " |-- Origin: string (nullable = true)\n", " |-- Dest: string (nullable = true)\n", " |-- Distance: string (nullable = true)\n", " |-- TaxiIn: string (nullable = true)\n", " |-- TaxiOut: string (nullable = true)\n", " |-- Cancelled: integer (nullable = true)\n", " |-- CancellationCode: string (nullable = true)\n", " |-- Diverted: integer (nullable = true)\n", " |-- CarrierDelay: string (nullable = true)\n", " |-- WeatherDelay: string (nullable = true)\n", " |-- NASDelay: string (nullable = true)\n", " |-- SecurityDelay: string (nullable = true)\n", " |-- LateAircraftDelay: string (nullable = true)\n", "\n" ] } ], "source": [ "air.printSchema()" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "### Select columns" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "scrolled": false, "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------+-------+--------+\n", "|ArrDelay|AirTime|Distance|\n", "+--------+-------+--------+\n", "| 2| 25| 127|\n", "| 29| 248| 1623|\n", "| 8| NA| 622|\n", "| -2| 70| 451|\n", "| 11| 133| 1009|\n", "| 13| 177| 1562|\n", "| -12| 181| 1589|\n", "| 11| 364| 2611|\n", "| 13| 53| 304|\n", "| 9| NA| 888|\n", "| -8| 293| 2537|\n", "| 15| NA| 1723|\n", "| -14| NA| 1736|\n", "| 55| 285| 1927|\n", "| 23| 149| 991|\n", "| 64| 35| 193|\n", "| 29| 25| 77|\n", "| -8| NA| 447|\n", "| -6| 91| 678|\n", "| 35| 127| 998|\n", "+--------+-------+--------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "air.select([\"ArrDelay\",\"AirTime\",\"Distance\"]).show()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------------+--------------+\n", "|UniqueCarrier|(ArrDelay > 0)|\n", "+-------------+--------------+\n", "| XE| true|\n", "| CO| true|\n", "| AA| true|\n", "| WN| false|\n", "| CO| true|\n", "| AA| true|\n", "| DL| false|\n", "| AA| true|\n", "| US| true|\n", "| AA| true|\n", "| AS| false|\n", "| UA| true|\n", "| TW| false|\n", "| NW| true|\n", "| NW| true|\n", "| AA| true|\n", "| DH| true|\n", "| WN| false|\n", "| AA| false|\n", "| CO| true|\n", "+-------------+--------------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "air.select(air['UniqueCarrier'], air['ArrDelay']>0).show()" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------------+---------+------+\n", "|UniqueCarrier|DayOfWeek| count|\n", "+-------------+---------+------+\n", "| PS| 6| 406|\n", "| CO| 4| 55764|\n", "| ML (1)| 7| 442|\n", "| XE| 4| 14896|\n", "| TZ| 4| 1455|\n", "| OO| 3| 17310|\n", "| EA| 7| 6197|\n", "| OO| 4| 17666|\n", "| F9| 2| 1679|\n", "| EA| 5| 6295|\n", "| HA| 5| 1519|\n", "| UA| 4| 89272|\n", "| EV| 4| 9729|\n", "| DL| 6|106031|\n", "| FL| 5| 6962|\n", "| YV| 3| 4165|\n", "| AQ| 2| 1035|\n", "| ML (1)| 2| 502|\n", "| DL| 3|110827|\n", "| YV| 6| 3828|\n", "+-------------+---------+------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "# group data with respect to some columns \n", "air.groupBy([\"UniqueCarrier\",\"DayOfWeek\"]).count().show() " ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------------+------+\n", "|UniqueCarrier| count|\n", "+-------------+------+\n", "| DL|765388|\n", "| WN|703368|\n", "| AA|684522|\n", "| US|649056|\n", "| UA|611957|\n", "| NW|473820|\n", "| CO|373858|\n", "| TW|179081|\n", "| HP|173509|\n", "| MQ|164790|\n", "| AS|129863|\n", "| OO|120223|\n", "| XE| 94311|\n", "| EV| 67148|\n", "| OH| 60630|\n", "| FL| 47540|\n", "| EA| 43723|\n", "| PI| 41489|\n", "| DH| 32900|\n", "| B6| 29111|\n", "+-------------+------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "## Group and sort\n", "aircount=air.groupBy(\"UniqueCarrier\").count()\n", "aircount.sort(\"count\", ascending=False).show()" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "### Data cleaning" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "## Returns a new DataFrame containing the distinct rows in this DataFrame.\n", "## Takes a while to compute\n", "\n", "## air.distinct().count()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+\n", "|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|\n", "+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+\n", "|1997| 3| 2| 7| 2019| 2015| 2314| 2245| CO| 143| N59302| 295| 270| 248| 29| 4| EWR| COS| 1623| 6| 41| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|1994| 5| 2| 1| 700| 700| 804| 756| AA| 1629| NA| 124| 116| NA| 8| 0| BWI| ORD| 622| NA| NA| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|1997| 2| 14| 5| 700| 700| 728| 730| WN| 1783| N332| 88| 90| 70| -2| 0| TUS| LAX| 451| 8| 10| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|2000| 6| 11| 7| 2052| 2034| 2132| 2121| CO| 1753| N16893| 160| 167| 133| 11| 18| IAH| PHX| 1009| 5| 22| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|1997| 11| 16| 7| 1503| 1414| 1912| 1859| AA| 414| N205AA| 189| 225| 177| 13| 49| DFW| BOS| 1562| 4| 8| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|1999| 5| 31| 1| 839| 845| 1409| 1421| DL| 152| N177DZ| 210| 216| 181| -12| -6| SLC| ATL| 1589| 11| 18| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|1997| 4| 30| 3| 828| 830| 1206| 1155| AA| 11| N5DAAA| 398| 385| 364| 11| -2| BOS| LAX| 2611| 5| 29| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|1995| 7| 21| 5| 1018| 1000| 1126| 1113| US| 272| N274US| 68| 73| 53| 13| 18| GSO| PIT| 304| 4| 11| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|1989| 9| 28| 4| 1501| 1450| 1639| 1630| AA| 905| NA| 158| 160| NA| 9| 11| ORD| DEN| 888| NA| NA| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|1993| 6| 28| 1| 1540| 1541| 2146| 2131| UA| 1746| NA| 246| 230| NA| 15| -1| SAN| ORD| 1723| NA| NA| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|1994| 3| 31| 4| 1909| 1900| 2116| 2130| TW| 819| NA| 247| 270| NA| -14| 9| STL| SFO| 1736| NA| NA| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|2000| 5| 23| 2| 958| 910| 1205| 1110| NW| 281| N523US| 307| 300| 285| 55| 48| DTW| SEA| 1927| 8| 14| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|1993| 3| 17| 3| 630| 630| 747| 755| WN| 1968| NA| 77| 85| NA| -8| 0| SFO| SAN| 447| NA| NA| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|2002| 8| 25| 7| 1255| 1250| 1552| 1558| AA| 768| N464AA| 117| 128| 91| -6| 5| ORD| PHL| 678| 7| 19| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|1995| 7| 30| 7| 2018| 1955| 2318| 2243| CO| 765| N578PE| 180| 168| 127| 35| 23| EWR| TPA| 998| 3| 50| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|1990| 7| 7| 6| 1140| 1140| 1325| 1330| WN| 619| NA| 105| 110| NA| -5| 0| HOU| STL| 687| NA| NA| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|1999| 4| 20| 2| 1345| 1350| 1520| 1527| US| 1188| N428US| 95| 97| 76| -7| -5| CLT| TPA| 508| 2| 17| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|1998| 4| 16| 4| 1838| 1800| 2015| 1915| WN| 1273| N319| 97| 75| 65| 60| 38| PHX| LAX| 370| 7| 25| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|1991| 10| 16| 3| 955| 955| 1122| 1123| UA| 599| NA| 87| 88| NA| -1| 0| ORD| MSP| 334| NA| NA| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|2003| 4| 8| 2| 1642| 1644| 2022| 2029| AA| 909| N355AA| 160| 165| 137| -7| -2| DFW| MIA| 1121| 4| 19| 0| NA| 0| NA| NA| NA| NA| NA|\n", "+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "## Returns a new DataFrame omitting rows with null values\n", "air_without_na = air.na.drop()\n", "air_without_na.show()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "data": { "text/plain": [ "4018338" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "air_without_na.count()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/plain": [ "5548754" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "air.count() # original file size" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+\n", "|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|\n", "+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+\n", "|2006| 7| 6| 4| 2055| 2055| 2150| 2148| XE| 2619| N11526| 55| 53| 25| 2| 0| IAH| LCH| 127| 8| 22| 0| unknown| 0| 0| 0| 0| 0| 0|\n", "|1997| 3| 2| 7| 2019| 2015| 2314| 2245| CO| 143| N59302| 295| 270| 248| 29| 4| EWR| COS| 1623| 6| 41| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|1994| 5| 2| 1| 700| 700| 804| 756| AA| 1629| NA| 124| 116| NA| 8| 0| BWI| ORD| 622| NA| NA| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|1997| 2| 14| 5| 700| 700| 728| 730| WN| 1783| N332| 88| 90| 70| -2| 0| TUS| LAX| 451| 8| 10| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|2000| 6| 11| 7| 2052| 2034| 2132| 2121| CO| 1753| N16893| 160| 167| 133| 11| 18| IAH| PHX| 1009| 5| 22| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|1997| 11| 16| 7| 1503| 1414| 1912| 1859| AA| 414| N205AA| 189| 225| 177| 13| 49| DFW| BOS| 1562| 4| 8| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|1999| 5| 31| 1| 839| 845| 1409| 1421| DL| 152| N177DZ| 210| 216| 181| -12| -6| SLC| ATL| 1589| 11| 18| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|1997| 4| 30| 3| 828| 830| 1206| 1155| AA| 11| N5DAAA| 398| 385| 364| 11| -2| BOS| LAX| 2611| 5| 29| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|1995| 7| 21| 5| 1018| 1000| 1126| 1113| US| 272| N274US| 68| 73| 53| 13| 18| GSO| PIT| 304| 4| 11| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|1989| 9| 28| 4| 1501| 1450| 1639| 1630| AA| 905| NA| 158| 160| NA| 9| 11| ORD| DEN| 888| NA| NA| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|2007| 10| 26| 5| 654| 700| 1507| 1515| AS| 802| N648AS| 313| 315| 293| -8| -6| PDX| BOS| 2537| 9| 11| 0| unknown| 0| 0| 0| 0| 0| 0|\n", "|1993| 6| 28| 1| 1540| 1541| 2146| 2131| UA| 1746| NA| 246| 230| NA| 15| -1| SAN| ORD| 1723| NA| NA| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|1994| 3| 31| 4| 1909| 1900| 2116| 2130| TW| 819| NA| 247| 270| NA| -14| 9| STL| SFO| 1736| NA| NA| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|2000| 5| 23| 2| 958| 910| 1205| 1110| NW| 281| N523US| 307| 300| 285| 55| 48| DTW| SEA| 1927| 8| 14| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|2004| 10| 18| 1| 939| 940| 1151| 1128| NW| 679| N317NB| 192| 168| 149| 23| -1| MSP| SLC| 991| 10| 33| 0| unknown| 0| 0| 0| 23| 0| 0|\n", "|2007| 5| 17| 4| 1944| 1830| 2034| 1930| AA| 1011| N3BDAA| 50| 60| 35| 64| 74| MIA| MCO| 193| 4| 11| 0| unknown| 0| 64| 0| 0| 0| 0|\n", "|2003| 7| 31| 4| 1738| 1710| 1819| 1750| DH| 7981| N309UE| 41| 40| 25| 29| 28| IAD| CHO| 77| 3| 12| 0| unknown| 0| 29| 0| 0| 0| 0|\n", "|1993| 3| 17| 3| 630| 630| 747| 755| WN| 1968| NA| 77| 85| NA| -8| 0| SFO| SAN| 447| NA| NA| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|2002| 8| 25| 7| 1255| 1250| 1552| 1558| AA| 768| N464AA| 117| 128| 91| -6| 5| ORD| PHL| 678| 7| 19| 0| NA| 0| NA| NA| NA| NA| NA|\n", "|1995| 7| 30| 7| 2018| 1955| 2318| 2243| CO| 765| N578PE| 180| 168| 127| 35| 23| EWR| TPA| 998| 3| 50| 0| NA| 0| NA| NA| NA| NA| NA|\n", "+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "## Replace null values\n", "air.na.fill(\"unknown\").show()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+-------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+\n", "|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance| TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|\n", "+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+-------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+\n", "|2006| 7| 6| 4| 2055| 2055| 2150| 2148| XE| 2619| N11526| 55| 53| 25| 2| 0| IAH| LCH| 127| 8| 22| 0| null| 0| 0| 0| 0| 0| 0|\n", "|1997| 3| 2| 7| 2019| 2015| 2314| 2245| CO| 143| N59302| 295| 270| 248| 29| 4| EWR| COS| 1623| 6| 41| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown|\n", "|1994| 5| 2| 1| 700| 700| 804| 756| AA| 1629|unknown| 124| 116|unknown| 8| 0| BWI| ORD| 622|unknown|unknown| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown|\n", "|1997| 2| 14| 5| 700| 700| 728| 730| WN| 1783| N332| 88| 90| 70| -2| 0| TUS| LAX| 451| 8| 10| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown|\n", "|2000| 6| 11| 7| 2052| 2034| 2132| 2121| CO| 1753| N16893| 160| 167| 133| 11| 18| IAH| PHX| 1009| 5| 22| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown|\n", "|1997| 11| 16| 7| 1503| 1414| 1912| 1859| AA| 414| N205AA| 189| 225| 177| 13| 49| DFW| BOS| 1562| 4| 8| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown|\n", "|1999| 5| 31| 1| 839| 845| 1409| 1421| DL| 152| N177DZ| 210| 216| 181| -12| -6| SLC| ATL| 1589| 11| 18| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown|\n", "|1997| 4| 30| 3| 828| 830| 1206| 1155| AA| 11| N5DAAA| 398| 385| 364| 11| -2| BOS| LAX| 2611| 5| 29| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown|\n", "|1995| 7| 21| 5| 1018| 1000| 1126| 1113| US| 272| N274US| 68| 73| 53| 13| 18| GSO| PIT| 304| 4| 11| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown|\n", "|1989| 9| 28| 4| 1501| 1450| 1639| 1630| AA| 905|unknown| 158| 160|unknown| 9| 11| ORD| DEN| 888|unknown|unknown| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown|\n", "|2007| 10| 26| 5| 654| 700| 1507| 1515| AS| 802| N648AS| 313| 315| 293| -8| -6| PDX| BOS| 2537| 9| 11| 0| null| 0| 0| 0| 0| 0| 0|\n", "|1993| 6| 28| 1| 1540| 1541| 2146| 2131| UA| 1746|unknown| 246| 230|unknown| 15| -1| SAN| ORD| 1723|unknown|unknown| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown|\n", "|1994| 3| 31| 4| 1909| 1900| 2116| 2130| TW| 819|unknown| 247| 270|unknown| -14| 9| STL| SFO| 1736|unknown|unknown| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown|\n", "|2000| 5| 23| 2| 958| 910| 1205| 1110| NW| 281| N523US| 307| 300| 285| 55| 48| DTW| SEA| 1927| 8| 14| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown|\n", "|2004| 10| 18| 1| 939| 940| 1151| 1128| NW| 679| N317NB| 192| 168| 149| 23| -1| MSP| SLC| 991| 10| 33| 0| null| 0| 0| 0| 23| 0| 0|\n", "|2007| 5| 17| 4| 1944| 1830| 2034| 1930| AA| 1011| N3BDAA| 50| 60| 35| 64| 74| MIA| MCO| 193| 4| 11| 0| null| 0| 64| 0| 0| 0| 0|\n", "|2003| 7| 31| 4| 1738| 1710| 1819| 1750| DH| 7981| N309UE| 41| 40| 25| 29| 28| IAD| CHO| 77| 3| 12| 0| null| 0| 29| 0| 0| 0| 0|\n", "|1993| 3| 17| 3| 630| 630| 747| 755| WN| 1968|unknown| 77| 85|unknown| -8| 0| SFO| SAN| 447|unknown|unknown| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown|\n", "|2002| 8| 25| 7| 1255| 1250| 1552| 1558| AA| 768| N464AA| 117| 128| 91| -6| 5| ORD| PHL| 678| 7| 19| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown|\n", "|1995| 7| 30| 7| 2018| 1955| 2318| 2243| CO| 765| N578PE| 180| 168| 127| 35| 23| EWR| TPA| 998| 3| 50| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown|\n", "+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+-------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "air.na.replace('NA', \"unknown\").show()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "## Reload a local file becasue we changed the NA values\n", "air = spark.read.options(header='true', inferSchema='true').csv(\"/data/airdelay/airdelay_small.csv\") " ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "data": { "text/plain": [ "[Row(max(DayOfWeek)=7)]" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "air.groupBy().max('DayOfWeek').collect() # mus apply to a numeric column" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Statistics" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/plain": [ "0.008481756987561134" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "air.corr(\"Distance\",\"ArrDelay\")" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/plain": [ "140.5795326021564" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "air.cov(\"Distance\",\"ArrDelay\")" ] }, { "cell_type": "code", "execution_count": 60, "metadata": { "scrolled": false, "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+\n", "|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|\n", "+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+\n", "|2007| 5| 17| 4| 1944.0| 1830.0| 2034.0| 1930.0| AA| 1011| N3BDAA| 50.0| 60.0| 35.0| 64.0| 74.0| MIA| MCO| 193.0| 4.0| 11.0| 0| null| 0| 64.0| 0.0| 0.0| 0.0| 0.0|\n", "|2004| 4| 16| 5| 1550.0| 1525.0| 1750.0| 1638.0| XE| 2602| N15948| 120.0| 73.0| 35.0| 72.0| 25.0| EWR| BWI| 169.0| 5.0| 80.0| 0| null| 0| 0.0| 0.0| 51.0| 0.0| 21.0|\n", "|2005| 7| 1| 5| 1931.0| 1723.0| 2235.0| 2017.0| FL| 831| N978AT| 124.0| 114.0| 86.0| 138.0| 128.0| MDW| ATL| 590.0| 9.0| 29.0| 0| null| 0| 0.0| 0.0| 10.0| 0.0| 128.0|\n", "|2005| 11| 22| 2| 1849.0| 1705.0| 2034.0| 1815.0| B6| 55| N585JB| 105.0| 70.0| 56.0| 139.0| 104.0| BTV| JFK| 267.0| 4.0| 45.0| 0| null| 0| 0.0| 0.0| 35.0| 0.0| 104.0|\n", "|2007| 10| 21| 7| 1725.0| 1625.0| 1837.0| 1730.0| WN| 143| N389SW| 72.0| 65.0| 42.0| 67.0| 60.0| LAS| LAX| 236.0| 12.0| 18.0| 0| null| 0| 0.0| 0.0| 7.0| 0.0| 60.0|\n", "|2005| 4| 15| 5| 1519.0| 1310.0| 1751.0| 1545.0| AS| 631| N947AS| 152.0| 155.0| 136.0| 126.0| 129.0| LAS| SEA| 866.0| 5.0| 11.0| 0| null| 0| 4.0| 0.0| 0.0| 0.0| 122.0|\n", "|2006| 1| 24| 2| 1230.0| 1115.0| 1343.0| 1228.0| OH| 5050| N784CA| 73.0| 73.0| 50.0| 75.0| 75.0| JFK| BOS| 187.0| 5.0| 18.0| 0| null| 0| 75.0| 0.0| 0.0| 0.0| 0.0|\n", "|2007| 6| 16| 6| 1525.0| 1155.0| 1636.0| 1255.0| XE| 2317| N14933| 71.0| 60.0| 44.0| 221.0| 210.0| LFT| IAH| 201.0| 10.0| 17.0| 0| null| 0| 0.0| 0.0| 104.0| 0.0| 117.0|\n", "|2006| 12| 1| 5| 1016.0| 720.0| 1226.0| 925.0| MQ| 3484| N902BC| 190.0| 185.0| 167.0| 181.0| 176.0| CHS| DFW| 987.0| 11.0| 12.0| 0| null| 0| 176.0| 0.0| 5.0| 0.0| 0.0|\n", "|2007| 8| 25| 6| 1544.0| 1430.0| 2123.0| 2015.0| AA| 2073| N612AA| 279.0| 285.0| 256.0| 68.0| 74.0| ORD| SJU| 2072.0| 4.0| 19.0| 0| null| 0| 68.0| 0.0| 0.0| 0.0| 0.0|\n", "|2006| 7| 20| 4| 1102.0| 834.0| 1445.0| 1225.0| AA| 1652| N441AA| 163.0| 171.0| 144.0| 140.0| 148.0| ORD| RSW| 1120.0| 2.0| 17.0| 0| null| 0| 0.0| 140.0| 0.0| 0.0| 0.0|\n", "|2006| 7| 23| 7| 2110.0| 1920.0| 2228.0| 2035.0| WN| 2347| N433| 78.0| 75.0| 57.0| 113.0| 110.0| SNA| OAK| 371.0| 6.0| 15.0| 0| null| 0| 0.0| 0.0| 3.0| 4.0| 106.0|\n", "|2007| 1| 14| 7| 1110.0| 850.0| 1216.0| 1010.0| 9E| 5968| 85889E| 66.0| 80.0| 44.0| 126.0| 140.0| MEM| STL| 256.0| 5.0| 17.0| 0| null| 0| 126.0| 0.0| 0.0| 0.0| 0.0|\n", "|2004| 1| 11| 7| 1530.0| 1255.0| 1815.0| 1545.0| WN| 1405| N630| 105.0| 110.0| 94.0| 150.0| 155.0| OAK| PHX| 646.0| 3.0| 8.0| 0| null| 0| 150.0| 0.0| 0.0| 0.0| 0.0|\n", "|2007| 10| 26| 5| 1603.0| 1448.0| 1900.0| 1752.0| UA| 270| N535UA| 117.0| 124.0| 105.0| 68.0| 75.0| SNA| DEN| 846.0| 5.0| 7.0| 0| null| 0| 0.0| 0.0| 0.0| 0.0| 68.0|\n", "|2005| 10| 31| 1| 854.0| 858.0| 1136.0| 1000.0| MQ| 3384| N852AE| 162.0| 62.0| 41.0| 96.0| -4.0| SJT| DFW| 228.0| 109.0| 12.0| 0| null| 0| 0.0| 0.0| 96.0| 0.0| 0.0|\n", "|2006| 1| 6| 5| 2200.0| 2030.0| 2320.0| 2214.0| OH| 5830| N408CA| 140.0| 164.0| 119.0| 66.0| 90.0| LGA| BHM| 866.0| 5.0| 16.0| 0| null| 0| 0.0| 0.0| 0.0| 0.0| 66.0|\n", "|2005| 6| 1| 3| 1729.0| 1610.0| 1847.0| 1740.0| AA| 1966| N4UAAA| 78.0| 90.0| 62.0| 67.0| 79.0| DFW| MCI| 460.0| 3.0| 13.0| 0| null| 0| 67.0| 0.0| 0.0| 0.0| 0.0|\n", "|2007| 7| 13| 5| 1840.0| 1745.0| 2007.0| 1905.0| EV| 4263| N879AS| 147.0| 140.0| 125.0| 62.0| 55.0| DCA| JAN| 860.0| 6.0| 16.0| 0| null| 0| 55.0| 0.0| 7.0| 0.0| 0.0|\n", "|2006| 2| 26| 7| 1953.0| 1745.0| 2145.0| 1940.0| OO| 6765| N750SK| 112.0| 115.0| 92.0| 125.0| 128.0| DEN| TUS| 639.0| 6.0| 14.0| 0| null| 0| 125.0| 0.0| 0.0| 0.0| 0.0|\n", "+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "air.filter(air.ArrDelay > 60).show() # filter with certain conditions " ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "### User-defined functions" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "## air2 = air.select([\"DayOfWeek\",\"ArrDelay\",\"AirTime\",\"Distance\"])\n", "air2_pdf = air.select([\"DayOfWeek\", \"ArrDelay\",\"AirTime\",\"Distance\"]).toPandas()" ] }, { "cell_type": "code", "execution_count": 101, "metadata": { "scrolled": false, "slideshow": { "slide_type": "slide" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DayOfWeekArrDelayAirTimeDistance
04.02.025.0127.0
17.029.0248.01623.0
2NaNNaNNaNNaN
35.0-2.070.0451.0
47.011.0133.01009.0
57.013.0177.01562.0
61.0-12.0181.01589.0
73.011.0364.02611.0
85.013.053.0304.0
9NaNNaNNaNNaN
105.0-8.0293.02537.0
11NaNNaNNaNNaN
12NaNNaNNaNNaN
132.055.0285.01927.0
141.023.0149.0991.0
154.064.035.0193.0
164.029.025.077.0
17NaNNaNNaNNaN
187.0-6.091.0678.0
197.035.0127.0998.0
20NaNNaNNaNNaN
212.0-7.076.0508.0
224.060.065.0370.0
234.0-7.066.0407.0
24NaNNaNNaNNaN
253.035.0313.02421.0
262.0-7.0137.01121.0
27NaNNaNNaNNaN
285.012.0137.01185.0
291.0-1.046.0272.0
...............
5548724NaNNaNNaNNaN
55487251.0-11.0110.0846.0
55487263.02.077.0612.0
55487272.03.0155.01087.0
5548728NaNNaNNaNNaN
55487291.08.0131.0984.0
55487307.031.0153.01086.0
55487314.05.091.0641.0
55487327.0-13.0124.01005.0
5548733NaNNaNNaNNaN
55487342.0-7.0214.01900.0
55487355.0-7.0159.01195.0
55487361.0-1.0108.0773.0
5548737NaNNaNNaNNaN
5548738NaNNaNNaNNaN
5548739NaNNaNNaNNaN
5548740NaNNaNNaNNaN
5548741NaNNaNNaNNaN
5548742NaNNaNNaNNaN
55487436.0-13.040.0160.0
5548744NaNNaNNaNNaN
55487451.010.063.0369.0
5548746NaNNaNNaNNaN
55487475.0-6.0203.01471.0
5548748NaNNaNNaNNaN
55487493.013.059.0318.0
55487501.022.034.0181.0
55487511.011.071.0551.0
5548752NaNNaNNaNNaN
55487532.0-14.0107.0888.0
\n", "

5548754 rows × 4 columns

\n", "
" ], "text/plain": [ " DayOfWeek ArrDelay AirTime Distance\n", "0 4.0 2.0 25.0 127.0\n", "1 7.0 29.0 248.0 1623.0\n", "2 NaN NaN NaN NaN\n", "3 5.0 -2.0 70.0 451.0\n", "4 7.0 11.0 133.0 1009.0\n", "5 7.0 13.0 177.0 1562.0\n", "6 1.0 -12.0 181.0 1589.0\n", "7 3.0 11.0 364.0 2611.0\n", "8 5.0 13.0 53.0 304.0\n", "9 NaN NaN NaN NaN\n", "10 5.0 -8.0 293.0 2537.0\n", "11 NaN NaN NaN NaN\n", "12 NaN NaN NaN NaN\n", "13 2.0 55.0 285.0 1927.0\n", "14 1.0 23.0 149.0 991.0\n", "15 4.0 64.0 35.0 193.0\n", "16 4.0 29.0 25.0 77.0\n", "17 NaN NaN NaN NaN\n", "18 7.0 -6.0 91.0 678.0\n", "19 7.0 35.0 127.0 998.0\n", "20 NaN NaN NaN NaN\n", "21 2.0 -7.0 76.0 508.0\n", "22 4.0 60.0 65.0 370.0\n", "23 4.0 -7.0 66.0 407.0\n", "24 NaN NaN NaN NaN\n", "25 3.0 35.0 313.0 2421.0\n", "26 2.0 -7.0 137.0 1121.0\n", "27 NaN NaN NaN NaN\n", "28 5.0 12.0 137.0 1185.0\n", "29 1.0 -1.0 46.0 272.0\n", "... ... ... ... ...\n", "5548724 NaN NaN NaN NaN\n", "5548725 1.0 -11.0 110.0 846.0\n", "5548726 3.0 2.0 77.0 612.0\n", "5548727 2.0 3.0 155.0 1087.0\n", "5548728 NaN NaN NaN NaN\n", "5548729 1.0 8.0 131.0 984.0\n", "5548730 7.0 31.0 153.0 1086.0\n", "5548731 4.0 5.0 91.0 641.0\n", "5548732 7.0 -13.0 124.0 1005.0\n", "5548733 NaN NaN NaN NaN\n", "5548734 2.0 -7.0 214.0 1900.0\n", "5548735 5.0 -7.0 159.0 1195.0\n", "5548736 1.0 -1.0 108.0 773.0\n", "5548737 NaN NaN NaN NaN\n", "5548738 NaN NaN NaN NaN\n", "5548739 NaN NaN NaN NaN\n", "5548740 NaN NaN NaN NaN\n", "5548741 NaN NaN NaN NaN\n", "5548742 NaN NaN NaN NaN\n", "5548743 6.0 -13.0 40.0 160.0\n", "5548744 NaN NaN NaN NaN\n", "5548745 1.0 10.0 63.0 369.0\n", "5548746 NaN NaN NaN NaN\n", "5548747 5.0 -6.0 203.0 1471.0\n", "5548748 NaN NaN NaN NaN\n", "5548749 3.0 13.0 59.0 318.0\n", "5548750 1.0 22.0 34.0 181.0\n", "5548751 1.0 11.0 71.0 551.0\n", "5548752 NaN NaN NaN NaN\n", "5548753 2.0 -14.0 107.0 888.0\n", "\n", "[5548754 rows x 4 columns]" ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "air2_pdf" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true, "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "def myfun(pdf):\n", " out = dict() \n", " out[\"ArrDelay\"] = pdf.ArrDelay.mean()\n", " out[\"AirTime\"] = pdf.AirTime.mean()\n", " out[\"Distance\"] = pdf.Distance.mean()\n", " \n", " return pd.DataFrame(out, index=[0])\n", "\n", "myfun(air2_pdf)" ] }, { "cell_type": "code", "execution_count": 119, "metadata": { "scrolled": false, "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "import pandas as pd\n", "from pyspark.sql.functions import pandas_udf, PandasUDFType\n", "\n", "@pandas_udf(\"ArrDelay long, AirTime long, Distance long\", PandasUDFType .GROUPED_MAP) \n", "def myfun(pdf):\n", " out = dict() \n", " out[\"ArrDelay\"] = pdf.ArrDelay.mean()\n", " out[\"AirTime\"] = pdf.AirTime.mean()\n", " out[\"Distance\"] = pdf.Distance.mean()\n", " \n", " return pd.DataFrame(out, index=[0])" ] }, { "cell_type": "code", "execution_count": 108, "metadata": { "scrolled": true, "slideshow": { "slide_type": "fragment" } }, "outputs": [], "source": [ "air2 = air.select([\"DayOfWeek\",\"ArrDelay\",\"AirTime\",\"Distance\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "source": [ "# air3 = air2.na.drop()\n", "# air3.groupby(\"DayOfWeek\").apply(myfun).show()" ] } ], "metadata": { "celltoolbar": "Slideshow", "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.17" }, "rise": { "auto_select": "first", "autolaunch": false, "chalkboard": { "chalkEffect": 1, "chalkWidth": 4, "theme": "whiteboard", "transition": 800 }, "enable_chalkboard": true, "reveal_shortcuts": { "chalkboard": { "clear": "ctrl-k" } }, "start_slideshow_at": "selected", "theme": "black" } }, "nbformat": 4, "nbformat_minor": 4 }