import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder\
.config("spark.executor.memory", "2g")\
.config("spark.cores.max", "2")\
.master("spark://master:7077")\
.appName("Python Spark").getOrCreate() # using spark server
SparkSession - in-memory
Note:
If you have problem to start pyspark interactive session due to system limitation. You could submit your spakr Job via the spark-submit
command as below.
PYSPARK-PYTHON=python3.7 spark-submit \
--conf spark.ui.enabled=false \
your-pyspark-code.py
## Load a file becasue we are using spark's master mode
air0 = spark.read.options(header='true', inferSchema='true').csv("/data/airdelay_small.csv")
air0 # the schema is not correct for some variables
DataFrame[Year: int, Month: int, DayofMonth: int, DayOfWeek: int, DepTime: string, CRSDepTime: int, ArrTime: string, CRSArrTime: int, UniqueCarrier: string, FlightNum: int, TailNum: string, ActualElapsedTime: string, CRSElapsedTime: string, AirTime: string, ArrDelay: string, DepDelay: string, Origin: string, Dest: string, Distance: string, TaxiIn: string, TaxiOut: string, Cancelled: int, CancellationCode: string, Diverted: int, CarrierDelay: string, WeatherDelay: string, NASDelay: string, SecurityDelay: string, LateAircraftDelay: string]
# We specify the correct schema by hand
from pyspark.sql.types import *
schema_sdf = StructType([
StructField('Year', IntegerType(), True),
StructField('Month', IntegerType(), True),
StructField('DayofMonth', IntegerType(), True),
StructField('DayOfWeek', IntegerType(), True),
StructField('DepTime', DoubleType(), True),
StructField('CRSDepTime', DoubleType(), True),
StructField('ArrTime', DoubleType(), True),
StructField('CRSArrTime', DoubleType(), True),
StructField('UniqueCarrier', StringType(), True),
StructField('FlightNum', StringType(), True),
StructField('TailNum', StringType(), True),
StructField('ActualElapsedTime', DoubleType(), True),
StructField('CRSElapsedTime', DoubleType(), True),
StructField('AirTime', DoubleType(), True),
StructField('ArrDelay', DoubleType(), True),
StructField('DepDelay', DoubleType(), True),
StructField('Origin', Str ingType(), True),
StructFi eld('Dest', StringType(), True),
StructField('Distance', DoubleType(), True),
StructField('TaxiIn', DoubleType(), True),
StructField('TaxiOut', DoubleType(), True),
StructField('Cancelled', IntegerType(), True),
StructField('CancellationCode', StringType(), True),
StructField('Diverted', IntegerType(), True),
StructField('CarrierDelay', DoubleType(), True),
StructField('WeatherDelay', DoubleType(), True),
StructField('NASDelay', DoubleType(), True),
StructField('SecurityDelay', DoubleType(), True),
StructField('LateAircraftDelay', DoubleType(), True)
])
air = spark.read.options(header='true').schema(schema_sdf).csv("/data/airdelay_small.csv")
air
air
DataFrame[Year: int, Month: int, DayofMonth: int, DayOfWeek: int, DepTime: double, CRSDepTime: double, ArrTime: double, CRSArrTime: double, UniqueCarrier: string, FlightNum: string, TailNum: string, ActualElapsedTime: double, CRSElapsedTime: double, AirTime: double, ArrDelay: double, DepDelay: double, Origin: string, Dest: string, Distance: double, TaxiIn: double, TaxiOut: double, Cancelled: int, CancellationCode: string, Diverted: int, CarrierDelay: double, WeatherDelay: double, NASDelay: double, SecurityDelay: double, LateAircraftDelay: double]
air.describe().show()
+-------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+-------------+------------------+-------+------------------+------------------+------------------+-----------------+------------------+-------+-------+-----------------+------------------+------------------+---------+----------------+--------+------------------+------------------+------------------+--------------------+------------------+ |summary| Year| Month| DayofMonth| DayOfWeek| DepTime| CRSDepTime| ArrTime| CRSArrTime|UniqueCarrier| FlightNum|TailNum| ActualElapsedTime| CRSElapsedTime| AirTime| ArrDelay| DepDelay| Origin| Dest| Distance| TaxiIn| TaxiOut|Cancelled|CancellationCode|Diverted| CarrierDelay| WeatherDelay| NASDelay| SecurityDelay| LateAircraftDelay| +-------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+-------------+------------------+-------+------------------+------------------+------------------+-----------------+------------------+-------+-------+-----------------+------------------+------------------+---------+----------------+--------+------------------+------------------+------------------+--------------------+------------------+ | count| 1524481| 1524481| 1524481| 1524481| 1524481| 1524481| 1524481| 1524481| 1524481| 1524481|1524481| 1524481| 1524481| 1524481| 1524481| 1524481|1524481|1524481| 1524481| 1524481| 1524481| 1524481| 0| 1524481| 1524481| 1524481| 1524481| 1524481| 1524481| | mean|2005.2200309482375| 6.819439533847913|15.739877374660622|3.9425345412635515|1343.1007890554229|1335.312045870037|1490.5321424143692|1499.0464518744411| null|2100.4755290489024| 0.0|124.89908893584112|126.23595243233599|102.60629092786331|7.728503667805634| 9.065091660702889| null| null|723.8429826281863|7.4043402311999955|16.027061012895537| 0.0| null| 0.0|3.1917360728011697|0.6896806191746568| 3.555705187535955|0.025394216129948487| 4.100354809276075| | stddev|1.3298684774182468|3.3772885531435493| 8.788305183327493|1.9915258421755744|475.47666780749284|462.8261301202478| 499.1625758193302| 478.3461722588984| null| 1937.457867690337| 0.0| 70.9126133314637| 70.11716827588829| 78.48819536331416|35.41450193208633|32.169219586304244| null| null|571.7621650788815| 35.98689560676167|12.080643220148497| 0.0| null| 0.0| 18.41498545167556| 8.708781309481942|15.282413095867522| 1.1406729402330553|18.704288068396867| | min| 2003| 1| 1| 1| 1.0| 3.0| 1.0| 0.0| 9E| 1| 0| -681.0| 16.0| -1461.0| -735.0| -1197.0| ABE| ABE| 27.0| 0.0| 0.0| 0| null| 0| 0.0| 0.0| -13.0| 0.0| 0.0| | max| 2007| 12| 31| 7| 2644.0| 2359.0| 2742.0| 2359.0| YV| 999| n816ca| 1766.0| 660.0| 1936.0| 1779.0| 1752.0| YUM| YUM| 4962.0| 1470.0| 1439.0| 0| null| 0| 1665.0| 910.0| 1010.0| 382.0| 1060.0| +-------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+-------------+------------------+-------+------------------+------------------+------------------+-----------------+------------------+-------+-------+-----------------+------------------+------------------+---------+----------------+--------+------------------+------------------+------------------+--------------------+------------------+
air.describe(['ArrDelay']).show()
+-------+------------------+ |summary| ArrDelay| +-------+------------------+ | count| 5548754| | mean| 6.97897995898367| | stddev|30.191156753519472| | min| -1| | max| NA| +-------+------------------+
air.printSchema()
root |-- Year: integer (nullable = true) |-- Month: integer (nullable = true) |-- DayofMonth: integer (nullable = true) |-- DayOfWeek: integer (nullable = true) |-- DepTime: string (nullable = true) |-- CRSDepTime: integer (nullable = true) |-- ArrTime: string (nullable = true) |-- CRSArrTime: integer (nullable = true) |-- UniqueCarrier: string (nullable = true) |-- FlightNum: integer (nullable = true) |-- TailNum: string (nullable = true) |-- ActualElapsedTime: string (nullable = true) |-- CRSElapsedTime: string (nullable = true) |-- AirTime: string (nullable = true) |-- ArrDelay: string (nullable = true) |-- DepDelay: string (nullable = true) |-- Origin: string (nullable = true) |-- Dest: string (nullable = true) |-- Distance: string (nullable = true) |-- TaxiIn: string (nullable = true) |-- TaxiOut: string (nullable = true) |-- Cancelled: integer (nullable = true) |-- CancellationCode: string (nullable = true) |-- Diverted: integer (nullable = true) |-- CarrierDelay: string (nullable = true) |-- WeatherDelay: string (nullable = true) |-- NASDelay: string (nullable = true) |-- SecurityDelay: string (nullable = true) |-- LateAircraftDelay: string (nullable = true)
air.select(["ArrDelay","AirTime","Distance"]).show()
+--------+-------+--------+ |ArrDelay|AirTime|Distance| +--------+-------+--------+ | 2| 25| 127| | 29| 248| 1623| | 8| NA| 622| | -2| 70| 451| | 11| 133| 1009| | 13| 177| 1562| | -12| 181| 1589| | 11| 364| 2611| | 13| 53| 304| | 9| NA| 888| | -8| 293| 2537| | 15| NA| 1723| | -14| NA| 1736| | 55| 285| 1927| | 23| 149| 991| | 64| 35| 193| | 29| 25| 77| | -8| NA| 447| | -6| 91| 678| | 35| 127| 998| +--------+-------+--------+ only showing top 20 rows
air.select(air['UniqueCarrier'], air['ArrDelay']>0).show()
+-------------+--------------+ |UniqueCarrier|(ArrDelay > 0)| +-------------+--------------+ | XE| true| | CO| true| | AA| true| | WN| false| | CO| true| | AA| true| | DL| false| | AA| true| | US| true| | AA| true| | AS| false| | UA| true| | TW| false| | NW| true| | NW| true| | AA| true| | DH| true| | WN| false| | AA| false| | CO| true| +-------------+--------------+ only showing top 20 rows
# group data with respect to some columns
air.groupBy(["UniqueCarrier","DayOfWeek"]).count().show()
+-------------+---------+------+ |UniqueCarrier|DayOfWeek| count| +-------------+---------+------+ | PS| 6| 406| | CO| 4| 55764| | ML (1)| 7| 442| | XE| 4| 14896| | TZ| 4| 1455| | OO| 3| 17310| | EA| 7| 6197| | OO| 4| 17666| | F9| 2| 1679| | EA| 5| 6295| | HA| 5| 1519| | UA| 4| 89272| | EV| 4| 9729| | DL| 6|106031| | FL| 5| 6962| | YV| 3| 4165| | AQ| 2| 1035| | ML (1)| 2| 502| | DL| 3|110827| | YV| 6| 3828| +-------------+---------+------+ only showing top 20 rows
## Group and sort
aircount=air.groupBy("UniqueCarrier").count()
aircount.sort("count", ascending=False).show()
+-------------+------+ |UniqueCarrier| count| +-------------+------+ | DL|765388| | WN|703368| | AA|684522| | US|649056| | UA|611957| | NW|473820| | CO|373858| | TW|179081| | HP|173509| | MQ|164790| | AS|129863| | OO|120223| | XE| 94311| | EV| 67148| | OH| 60630| | FL| 47540| | EA| 43723| | PI| 41489| | DH| 32900| | B6| 29111| +-------------+------+ only showing top 20 rows
## Returns a new DataFrame containing the distinct rows in this DataFrame.
## Takes a while to compute
## air.distinct().count()
## Returns a new DataFrame omitting rows with null values
air_without_na = air.na.drop()
air_without_na.show()
+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+ |Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay| +----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+ |1997| 3| 2| 7| 2019| 2015| 2314| 2245| CO| 143| N59302| 295| 270| 248| 29| 4| EWR| COS| 1623| 6| 41| 0| NA| 0| NA| NA| NA| NA| NA| |1994| 5| 2| 1| 700| 700| 804| 756| AA| 1629| NA| 124| 116| NA| 8| 0| BWI| ORD| 622| NA| NA| 0| NA| 0| NA| NA| NA| NA| NA| |1997| 2| 14| 5| 700| 700| 728| 730| WN| 1783| N332| 88| 90| 70| -2| 0| TUS| LAX| 451| 8| 10| 0| NA| 0| NA| NA| NA| NA| NA| |2000| 6| 11| 7| 2052| 2034| 2132| 2121| CO| 1753| N16893| 160| 167| 133| 11| 18| IAH| PHX| 1009| 5| 22| 0| NA| 0| NA| NA| NA| NA| NA| |1997| 11| 16| 7| 1503| 1414| 1912| 1859| AA| 414| N205AA| 189| 225| 177| 13| 49| DFW| BOS| 1562| 4| 8| 0| NA| 0| NA| NA| NA| NA| NA| |1999| 5| 31| 1| 839| 845| 1409| 1421| DL| 152| N177DZ| 210| 216| 181| -12| -6| SLC| ATL| 1589| 11| 18| 0| NA| 0| NA| NA| NA| NA| NA| |1997| 4| 30| 3| 828| 830| 1206| 1155| AA| 11| N5DAAA| 398| 385| 364| 11| -2| BOS| LAX| 2611| 5| 29| 0| NA| 0| NA| NA| NA| NA| NA| |1995| 7| 21| 5| 1018| 1000| 1126| 1113| US| 272| N274US| 68| 73| 53| 13| 18| GSO| PIT| 304| 4| 11| 0| NA| 0| NA| NA| NA| NA| NA| |1989| 9| 28| 4| 1501| 1450| 1639| 1630| AA| 905| NA| 158| 160| NA| 9| 11| ORD| DEN| 888| NA| NA| 0| NA| 0| NA| NA| NA| NA| NA| |1993| 6| 28| 1| 1540| 1541| 2146| 2131| UA| 1746| NA| 246| 230| NA| 15| -1| SAN| ORD| 1723| NA| NA| 0| NA| 0| NA| NA| NA| NA| NA| |1994| 3| 31| 4| 1909| 1900| 2116| 2130| TW| 819| NA| 247| 270| NA| -14| 9| STL| SFO| 1736| NA| NA| 0| NA| 0| NA| NA| NA| NA| NA| |2000| 5| 23| 2| 958| 910| 1205| 1110| NW| 281| N523US| 307| 300| 285| 55| 48| DTW| SEA| 1927| 8| 14| 0| NA| 0| NA| NA| NA| NA| NA| |1993| 3| 17| 3| 630| 630| 747| 755| WN| 1968| NA| 77| 85| NA| -8| 0| SFO| SAN| 447| NA| NA| 0| NA| 0| NA| NA| NA| NA| NA| |2002| 8| 25| 7| 1255| 1250| 1552| 1558| AA| 768| N464AA| 117| 128| 91| -6| 5| ORD| PHL| 678| 7| 19| 0| NA| 0| NA| NA| NA| NA| NA| |1995| 7| 30| 7| 2018| 1955| 2318| 2243| CO| 765| N578PE| 180| 168| 127| 35| 23| EWR| TPA| 998| 3| 50| 0| NA| 0| NA| NA| NA| NA| NA| |1990| 7| 7| 6| 1140| 1140| 1325| 1330| WN| 619| NA| 105| 110| NA| -5| 0| HOU| STL| 687| NA| NA| 0| NA| 0| NA| NA| NA| NA| NA| |1999| 4| 20| 2| 1345| 1350| 1520| 1527| US| 1188| N428US| 95| 97| 76| -7| -5| CLT| TPA| 508| 2| 17| 0| NA| 0| NA| NA| NA| NA| NA| |1998| 4| 16| 4| 1838| 1800| 2015| 1915| WN| 1273| N319| 97| 75| 65| 60| 38| PHX| LAX| 370| 7| 25| 0| NA| 0| NA| NA| NA| NA| NA| |1991| 10| 16| 3| 955| 955| 1122| 1123| UA| 599| NA| 87| 88| NA| -1| 0| ORD| MSP| 334| NA| NA| 0| NA| 0| NA| NA| NA| NA| NA| |2003| 4| 8| 2| 1642| 1644| 2022| 2029| AA| 909| N355AA| 160| 165| 137| -7| -2| DFW| MIA| 1121| 4| 19| 0| NA| 0| NA| NA| NA| NA| NA| +----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+ only showing top 20 rows
air_without_na.count()
4018338
air.count() # original file size
5548754
## Replace null values
air.na.fill("unknown").show()
+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+ |Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay| +----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+ |2006| 7| 6| 4| 2055| 2055| 2150| 2148| XE| 2619| N11526| 55| 53| 25| 2| 0| IAH| LCH| 127| 8| 22| 0| unknown| 0| 0| 0| 0| 0| 0| |1997| 3| 2| 7| 2019| 2015| 2314| 2245| CO| 143| N59302| 295| 270| 248| 29| 4| EWR| COS| 1623| 6| 41| 0| NA| 0| NA| NA| NA| NA| NA| |1994| 5| 2| 1| 700| 700| 804| 756| AA| 1629| NA| 124| 116| NA| 8| 0| BWI| ORD| 622| NA| NA| 0| NA| 0| NA| NA| NA| NA| NA| |1997| 2| 14| 5| 700| 700| 728| 730| WN| 1783| N332| 88| 90| 70| -2| 0| TUS| LAX| 451| 8| 10| 0| NA| 0| NA| NA| NA| NA| NA| |2000| 6| 11| 7| 2052| 2034| 2132| 2121| CO| 1753| N16893| 160| 167| 133| 11| 18| IAH| PHX| 1009| 5| 22| 0| NA| 0| NA| NA| NA| NA| NA| |1997| 11| 16| 7| 1503| 1414| 1912| 1859| AA| 414| N205AA| 189| 225| 177| 13| 49| DFW| BOS| 1562| 4| 8| 0| NA| 0| NA| NA| NA| NA| NA| |1999| 5| 31| 1| 839| 845| 1409| 1421| DL| 152| N177DZ| 210| 216| 181| -12| -6| SLC| ATL| 1589| 11| 18| 0| NA| 0| NA| NA| NA| NA| NA| |1997| 4| 30| 3| 828| 830| 1206| 1155| AA| 11| N5DAAA| 398| 385| 364| 11| -2| BOS| LAX| 2611| 5| 29| 0| NA| 0| NA| NA| NA| NA| NA| |1995| 7| 21| 5| 1018| 1000| 1126| 1113| US| 272| N274US| 68| 73| 53| 13| 18| GSO| PIT| 304| 4| 11| 0| NA| 0| NA| NA| NA| NA| NA| |1989| 9| 28| 4| 1501| 1450| 1639| 1630| AA| 905| NA| 158| 160| NA| 9| 11| ORD| DEN| 888| NA| NA| 0| NA| 0| NA| NA| NA| NA| NA| |2007| 10| 26| 5| 654| 700| 1507| 1515| AS| 802| N648AS| 313| 315| 293| -8| -6| PDX| BOS| 2537| 9| 11| 0| unknown| 0| 0| 0| 0| 0| 0| |1993| 6| 28| 1| 1540| 1541| 2146| 2131| UA| 1746| NA| 246| 230| NA| 15| -1| SAN| ORD| 1723| NA| NA| 0| NA| 0| NA| NA| NA| NA| NA| |1994| 3| 31| 4| 1909| 1900| 2116| 2130| TW| 819| NA| 247| 270| NA| -14| 9| STL| SFO| 1736| NA| NA| 0| NA| 0| NA| NA| NA| NA| NA| |2000| 5| 23| 2| 958| 910| 1205| 1110| NW| 281| N523US| 307| 300| 285| 55| 48| DTW| SEA| 1927| 8| 14| 0| NA| 0| NA| NA| NA| NA| NA| |2004| 10| 18| 1| 939| 940| 1151| 1128| NW| 679| N317NB| 192| 168| 149| 23| -1| MSP| SLC| 991| 10| 33| 0| unknown| 0| 0| 0| 23| 0| 0| |2007| 5| 17| 4| 1944| 1830| 2034| 1930| AA| 1011| N3BDAA| 50| 60| 35| 64| 74| MIA| MCO| 193| 4| 11| 0| unknown| 0| 64| 0| 0| 0| 0| |2003| 7| 31| 4| 1738| 1710| 1819| 1750| DH| 7981| N309UE| 41| 40| 25| 29| 28| IAD| CHO| 77| 3| 12| 0| unknown| 0| 29| 0| 0| 0| 0| |1993| 3| 17| 3| 630| 630| 747| 755| WN| 1968| NA| 77| 85| NA| -8| 0| SFO| SAN| 447| NA| NA| 0| NA| 0| NA| NA| NA| NA| NA| |2002| 8| 25| 7| 1255| 1250| 1552| 1558| AA| 768| N464AA| 117| 128| 91| -6| 5| ORD| PHL| 678| 7| 19| 0| NA| 0| NA| NA| NA| NA| NA| |1995| 7| 30| 7| 2018| 1955| 2318| 2243| CO| 765| N578PE| 180| 168| 127| 35| 23| EWR| TPA| 998| 3| 50| 0| NA| 0| NA| NA| NA| NA| NA| +----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+ only showing top 20 rows
air.na.replace('NA', "unknown").show()
+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+-------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+ |Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance| TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay| +----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+-------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+ |2006| 7| 6| 4| 2055| 2055| 2150| 2148| XE| 2619| N11526| 55| 53| 25| 2| 0| IAH| LCH| 127| 8| 22| 0| null| 0| 0| 0| 0| 0| 0| |1997| 3| 2| 7| 2019| 2015| 2314| 2245| CO| 143| N59302| 295| 270| 248| 29| 4| EWR| COS| 1623| 6| 41| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown| |1994| 5| 2| 1| 700| 700| 804| 756| AA| 1629|unknown| 124| 116|unknown| 8| 0| BWI| ORD| 622|unknown|unknown| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown| |1997| 2| 14| 5| 700| 700| 728| 730| WN| 1783| N332| 88| 90| 70| -2| 0| TUS| LAX| 451| 8| 10| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown| |2000| 6| 11| 7| 2052| 2034| 2132| 2121| CO| 1753| N16893| 160| 167| 133| 11| 18| IAH| PHX| 1009| 5| 22| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown| |1997| 11| 16| 7| 1503| 1414| 1912| 1859| AA| 414| N205AA| 189| 225| 177| 13| 49| DFW| BOS| 1562| 4| 8| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown| |1999| 5| 31| 1| 839| 845| 1409| 1421| DL| 152| N177DZ| 210| 216| 181| -12| -6| SLC| ATL| 1589| 11| 18| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown| |1997| 4| 30| 3| 828| 830| 1206| 1155| AA| 11| N5DAAA| 398| 385| 364| 11| -2| BOS| LAX| 2611| 5| 29| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown| |1995| 7| 21| 5| 1018| 1000| 1126| 1113| US| 272| N274US| 68| 73| 53| 13| 18| GSO| PIT| 304| 4| 11| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown| |1989| 9| 28| 4| 1501| 1450| 1639| 1630| AA| 905|unknown| 158| 160|unknown| 9| 11| ORD| DEN| 888|unknown|unknown| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown| |2007| 10| 26| 5| 654| 700| 1507| 1515| AS| 802| N648AS| 313| 315| 293| -8| -6| PDX| BOS| 2537| 9| 11| 0| null| 0| 0| 0| 0| 0| 0| |1993| 6| 28| 1| 1540| 1541| 2146| 2131| UA| 1746|unknown| 246| 230|unknown| 15| -1| SAN| ORD| 1723|unknown|unknown| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown| |1994| 3| 31| 4| 1909| 1900| 2116| 2130| TW| 819|unknown| 247| 270|unknown| -14| 9| STL| SFO| 1736|unknown|unknown| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown| |2000| 5| 23| 2| 958| 910| 1205| 1110| NW| 281| N523US| 307| 300| 285| 55| 48| DTW| SEA| 1927| 8| 14| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown| |2004| 10| 18| 1| 939| 940| 1151| 1128| NW| 679| N317NB| 192| 168| 149| 23| -1| MSP| SLC| 991| 10| 33| 0| null| 0| 0| 0| 23| 0| 0| |2007| 5| 17| 4| 1944| 1830| 2034| 1930| AA| 1011| N3BDAA| 50| 60| 35| 64| 74| MIA| MCO| 193| 4| 11| 0| null| 0| 64| 0| 0| 0| 0| |2003| 7| 31| 4| 1738| 1710| 1819| 1750| DH| 7981| N309UE| 41| 40| 25| 29| 28| IAD| CHO| 77| 3| 12| 0| null| 0| 29| 0| 0| 0| 0| |1993| 3| 17| 3| 630| 630| 747| 755| WN| 1968|unknown| 77| 85|unknown| -8| 0| SFO| SAN| 447|unknown|unknown| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown| |2002| 8| 25| 7| 1255| 1250| 1552| 1558| AA| 768| N464AA| 117| 128| 91| -6| 5| ORD| PHL| 678| 7| 19| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown| |1995| 7| 30| 7| 2018| 1955| 2318| 2243| CO| 765| N578PE| 180| 168| 127| 35| 23| EWR| TPA| 998| 3| 50| 0| unknown| 0| unknown| unknown| unknown| unknown| unknown| +----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+-------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+ only showing top 20 rows
## Reload a local file becasue we changed the NA values
air = spark.read.options(header='true', inferSchema='true').csv("/data/airdelay/airdelay_small.csv")
air.groupBy().max('DayOfWeek').collect() # mus apply to a numeric column
[Row(max(DayOfWeek)=7)]
air.corr("Distance","ArrDelay")
0.008481756987561134
air.cov("Distance","ArrDelay")
140.5795326021564
air.filter(air.ArrDelay > 60).show() # filter with certain conditions
+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+ |Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay| +----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+ |2007| 5| 17| 4| 1944.0| 1830.0| 2034.0| 1930.0| AA| 1011| N3BDAA| 50.0| 60.0| 35.0| 64.0| 74.0| MIA| MCO| 193.0| 4.0| 11.0| 0| null| 0| 64.0| 0.0| 0.0| 0.0| 0.0| |2004| 4| 16| 5| 1550.0| 1525.0| 1750.0| 1638.0| XE| 2602| N15948| 120.0| 73.0| 35.0| 72.0| 25.0| EWR| BWI| 169.0| 5.0| 80.0| 0| null| 0| 0.0| 0.0| 51.0| 0.0| 21.0| |2005| 7| 1| 5| 1931.0| 1723.0| 2235.0| 2017.0| FL| 831| N978AT| 124.0| 114.0| 86.0| 138.0| 128.0| MDW| ATL| 590.0| 9.0| 29.0| 0| null| 0| 0.0| 0.0| 10.0| 0.0| 128.0| |2005| 11| 22| 2| 1849.0| 1705.0| 2034.0| 1815.0| B6| 55| N585JB| 105.0| 70.0| 56.0| 139.0| 104.0| BTV| JFK| 267.0| 4.0| 45.0| 0| null| 0| 0.0| 0.0| 35.0| 0.0| 104.0| |2007| 10| 21| 7| 1725.0| 1625.0| 1837.0| 1730.0| WN| 143| N389SW| 72.0| 65.0| 42.0| 67.0| 60.0| LAS| LAX| 236.0| 12.0| 18.0| 0| null| 0| 0.0| 0.0| 7.0| 0.0| 60.0| |2005| 4| 15| 5| 1519.0| 1310.0| 1751.0| 1545.0| AS| 631| N947AS| 152.0| 155.0| 136.0| 126.0| 129.0| LAS| SEA| 866.0| 5.0| 11.0| 0| null| 0| 4.0| 0.0| 0.0| 0.0| 122.0| |2006| 1| 24| 2| 1230.0| 1115.0| 1343.0| 1228.0| OH| 5050| N784CA| 73.0| 73.0| 50.0| 75.0| 75.0| JFK| BOS| 187.0| 5.0| 18.0| 0| null| 0| 75.0| 0.0| 0.0| 0.0| 0.0| |2007| 6| 16| 6| 1525.0| 1155.0| 1636.0| 1255.0| XE| 2317| N14933| 71.0| 60.0| 44.0| 221.0| 210.0| LFT| IAH| 201.0| 10.0| 17.0| 0| null| 0| 0.0| 0.0| 104.0| 0.0| 117.0| |2006| 12| 1| 5| 1016.0| 720.0| 1226.0| 925.0| MQ| 3484| N902BC| 190.0| 185.0| 167.0| 181.0| 176.0| CHS| DFW| 987.0| 11.0| 12.0| 0| null| 0| 176.0| 0.0| 5.0| 0.0| 0.0| |2007| 8| 25| 6| 1544.0| 1430.0| 2123.0| 2015.0| AA| 2073| N612AA| 279.0| 285.0| 256.0| 68.0| 74.0| ORD| SJU| 2072.0| 4.0| 19.0| 0| null| 0| 68.0| 0.0| 0.0| 0.0| 0.0| |2006| 7| 20| 4| 1102.0| 834.0| 1445.0| 1225.0| AA| 1652| N441AA| 163.0| 171.0| 144.0| 140.0| 148.0| ORD| RSW| 1120.0| 2.0| 17.0| 0| null| 0| 0.0| 140.0| 0.0| 0.0| 0.0| |2006| 7| 23| 7| 2110.0| 1920.0| 2228.0| 2035.0| WN| 2347| N433| 78.0| 75.0| 57.0| 113.0| 110.0| SNA| OAK| 371.0| 6.0| 15.0| 0| null| 0| 0.0| 0.0| 3.0| 4.0| 106.0| |2007| 1| 14| 7| 1110.0| 850.0| 1216.0| 1010.0| 9E| 5968| 85889E| 66.0| 80.0| 44.0| 126.0| 140.0| MEM| STL| 256.0| 5.0| 17.0| 0| null| 0| 126.0| 0.0| 0.0| 0.0| 0.0| |2004| 1| 11| 7| 1530.0| 1255.0| 1815.0| 1545.0| WN| 1405| N630| 105.0| 110.0| 94.0| 150.0| 155.0| OAK| PHX| 646.0| 3.0| 8.0| 0| null| 0| 150.0| 0.0| 0.0| 0.0| 0.0| |2007| 10| 26| 5| 1603.0| 1448.0| 1900.0| 1752.0| UA| 270| N535UA| 117.0| 124.0| 105.0| 68.0| 75.0| SNA| DEN| 846.0| 5.0| 7.0| 0| null| 0| 0.0| 0.0| 0.0| 0.0| 68.0| |2005| 10| 31| 1| 854.0| 858.0| 1136.0| 1000.0| MQ| 3384| N852AE| 162.0| 62.0| 41.0| 96.0| -4.0| SJT| DFW| 228.0| 109.0| 12.0| 0| null| 0| 0.0| 0.0| 96.0| 0.0| 0.0| |2006| 1| 6| 5| 2200.0| 2030.0| 2320.0| 2214.0| OH| 5830| N408CA| 140.0| 164.0| 119.0| 66.0| 90.0| LGA| BHM| 866.0| 5.0| 16.0| 0| null| 0| 0.0| 0.0| 0.0| 0.0| 66.0| |2005| 6| 1| 3| 1729.0| 1610.0| 1847.0| 1740.0| AA| 1966| N4UAAA| 78.0| 90.0| 62.0| 67.0| 79.0| DFW| MCI| 460.0| 3.0| 13.0| 0| null| 0| 67.0| 0.0| 0.0| 0.0| 0.0| |2007| 7| 13| 5| 1840.0| 1745.0| 2007.0| 1905.0| EV| 4263| N879AS| 147.0| 140.0| 125.0| 62.0| 55.0| DCA| JAN| 860.0| 6.0| 16.0| 0| null| 0| 55.0| 0.0| 7.0| 0.0| 0.0| |2006| 2| 26| 7| 1953.0| 1745.0| 2145.0| 1940.0| OO| 6765| N750SK| 112.0| 115.0| 92.0| 125.0| 128.0| DEN| TUS| 639.0| 6.0| 14.0| 0| null| 0| 125.0| 0.0| 0.0| 0.0| 0.0| +----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+ only showing top 20 rows
## air2 = air.select(["DayOfWeek","ArrDelay","AirTime","Distance"])
air2_pdf = air.select(["DayOfWeek", "ArrDelay","AirTime","Distance"]).toPandas()
air2_pdf
DayOfWeek | ArrDelay | AirTime | Distance | |
---|---|---|---|---|
0 | 4.0 | 2.0 | 25.0 | 127.0 |
1 | 7.0 | 29.0 | 248.0 | 1623.0 |
2 | NaN | NaN | NaN | NaN |
3 | 5.0 | -2.0 | 70.0 | 451.0 |
4 | 7.0 | 11.0 | 133.0 | 1009.0 |
5 | 7.0 | 13.0 | 177.0 | 1562.0 |
6 | 1.0 | -12.0 | 181.0 | 1589.0 |
7 | 3.0 | 11.0 | 364.0 | 2611.0 |
8 | 5.0 | 13.0 | 53.0 | 304.0 |
9 | NaN | NaN | NaN | NaN |
10 | 5.0 | -8.0 | 293.0 | 2537.0 |
11 | NaN | NaN | NaN | NaN |
12 | NaN | NaN | NaN | NaN |
13 | 2.0 | 55.0 | 285.0 | 1927.0 |
14 | 1.0 | 23.0 | 149.0 | 991.0 |
15 | 4.0 | 64.0 | 35.0 | 193.0 |
16 | 4.0 | 29.0 | 25.0 | 77.0 |
17 | NaN | NaN | NaN | NaN |
18 | 7.0 | -6.0 | 91.0 | 678.0 |
19 | 7.0 | 35.0 | 127.0 | 998.0 |
20 | NaN | NaN | NaN | NaN |
21 | 2.0 | -7.0 | 76.0 | 508.0 |
22 | 4.0 | 60.0 | 65.0 | 370.0 |
23 | 4.0 | -7.0 | 66.0 | 407.0 |
24 | NaN | NaN | NaN | NaN |
25 | 3.0 | 35.0 | 313.0 | 2421.0 |
26 | 2.0 | -7.0 | 137.0 | 1121.0 |
27 | NaN | NaN | NaN | NaN |
28 | 5.0 | 12.0 | 137.0 | 1185.0 |
29 | 1.0 | -1.0 | 46.0 | 272.0 |
... | ... | ... | ... | ... |
5548724 | NaN | NaN | NaN | NaN |
5548725 | 1.0 | -11.0 | 110.0 | 846.0 |
5548726 | 3.0 | 2.0 | 77.0 | 612.0 |
5548727 | 2.0 | 3.0 | 155.0 | 1087.0 |
5548728 | NaN | NaN | NaN | NaN |
5548729 | 1.0 | 8.0 | 131.0 | 984.0 |
5548730 | 7.0 | 31.0 | 153.0 | 1086.0 |
5548731 | 4.0 | 5.0 | 91.0 | 641.0 |
5548732 | 7.0 | -13.0 | 124.0 | 1005.0 |
5548733 | NaN | NaN | NaN | NaN |
5548734 | 2.0 | -7.0 | 214.0 | 1900.0 |
5548735 | 5.0 | -7.0 | 159.0 | 1195.0 |
5548736 | 1.0 | -1.0 | 108.0 | 773.0 |
5548737 | NaN | NaN | NaN | NaN |
5548738 | NaN | NaN | NaN | NaN |
5548739 | NaN | NaN | NaN | NaN |
5548740 | NaN | NaN | NaN | NaN |
5548741 | NaN | NaN | NaN | NaN |
5548742 | NaN | NaN | NaN | NaN |
5548743 | 6.0 | -13.0 | 40.0 | 160.0 |
5548744 | NaN | NaN | NaN | NaN |
5548745 | 1.0 | 10.0 | 63.0 | 369.0 |
5548746 | NaN | NaN | NaN | NaN |
5548747 | 5.0 | -6.0 | 203.0 | 1471.0 |
5548748 | NaN | NaN | NaN | NaN |
5548749 | 3.0 | 13.0 | 59.0 | 318.0 |
5548750 | 1.0 | 22.0 | 34.0 | 181.0 |
5548751 | 1.0 | 11.0 | 71.0 | 551.0 |
5548752 | NaN | NaN | NaN | NaN |
5548753 | 2.0 | -14.0 | 107.0 | 888.0 |
5548754 rows × 4 columns
def myfun(pdf):
out = dict()
out["ArrDelay"] = pdf.ArrDelay.mean()
out["AirTime"] = pdf.AirTime.mean()
out["Distance"] = pdf.Distance.mean()
return pd.DataFrame(out, index=[0])
myfun(air2_pdf)
import pandas as pd
from pyspark.sql.functions import pandas_udf, PandasUDFType
@pandas_udf("ArrDelay long, AirTime long, Distance long", PandasUDFType .GROUPED_MAP)
def myfun(pdf):
out = dict()
out["ArrDelay"] = pdf.ArrDelay.mean()
out["AirTime"] = pdf.AirTime.mean()
out["Distance"] = pdf.Distance.mean()
return pd.DataFrame(out, index=[0])
air2 = air.select(["DayOfWeek","ArrDelay","AirTime","Distance"])
# air3 = air2.na.drop()
# air3.groupby("DayOfWeek").apply(myfun).show()