A corpus is a large collection of texts: a body of written or spoken material upon which a linguistic analysis is based.
A corpus provides grammarians, lexicographers, and other interested parties with better descriptions of a language. Computer-processable corpora allow linguists to adopt the principle of total accountability, retrieving all the occurrences of a particular word or structure for inspection, or retrieving randomly selected samples.
Corpus analysis provides lexical, morphosyntactic, semantic, and pragmatic information.
A token is the technical name for a sequence of characters that we want to treat as a group.
The vocabulary of a text is the set of tokens that it uses; since it is a set, all duplicates are collapsed together. In Python we can obtain the vocabulary items with the built-in set() function.
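For example, a minimal plain-Python illustration (separate from the Spark code later in this section):

# Minimal sketch: the vocabulary of a tokenized text, obtained with set().
tokens = ["i", "wish", "java", "could", "use", "case", "classes", "i", "wish"]
vocabulary = set(tokens)        # duplicates are collapsed
print(sorted(vocabulary))       # ['case', 'classes', 'could', 'i', 'java', 'use', 'wish']
print(len(vocabulary))          # vocabulary size: 7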
Stopwords are common words that generally do not contribute to the meaning of a sentence, at least for the purposes of information retrieval and natural language processing.
These are words such as "the" and "a". Most search engines filter out stopwords from search queries and documents in order to save space in their index.
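For instance, a minimal sketch using NLTK's stopword list (assuming NLTK is installed and its 'stopwords' corpus has been downloaded; this is separate from the Spark StopWordsRemover example later in this section):

# Minimal sketch: filtering English stopwords with NLTK
# (run nltk.download('stopwords') once beforehand).
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))
tokens = ["i", "saw", "the", "red", "balloon"]
filtered = [t for t in tokens if t not in stop_words]
print(filtered)   # ['saw', 'red', 'balloon']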
Stemming is a technique to remove affixes from a word, ending up with the stem. For example, the stem of "cooking" is "cook", and a good stemming algorithm knows that the "ing" suffix can be removed.
Stemming is most commonly used by search engines for indexing words. Instead of storing all forms of a word, a search engine can store only the stems, greatly reducing the size of the index while increasing retrieval accuracy.
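A minimal sketch of stemming, assuming NLTK is installed (Spark ML itself does not provide a stemmer):

# Minimal sketch: stemming with NLTK's PorterStemmer.
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
print(stemmer.stem("cooking"))   # cook
print(stemmer.stem("cookery"))   # cookeri -- stems are not always dictionary words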
Word segmentation is the problem of dividing a string of written language into its component words.
In English and many other languages that use some form of the Latin alphabet, the space is a good approximation of a word divider (word delimiter). (Some examples where the space character alone may not be sufficient include contractions such as can't for cannot.)
However, the equivalent of this character is not found in all written scripts, and without it word segmentation is a difficult problem. Languages that do not have a trivial word segmentation process include Chinese and Japanese, where sentences but not words are delimited; Thai and Lao, where phrases and sentences but not words are delimited; and Vietnamese, where syllables but not words are delimited.
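For space-delimited languages, a first approximation is simply to split on whitespace; a minimal plain-Python sketch (it does not handle punctuation, contractions, or scripts without word delimiters):

# Minimal sketch: naive word segmentation by splitting on whitespace.
sentence = "I wish Java could use case classes"
print(sentence.split())   # ['I', 'wish', 'Java', 'could', 'use', 'case', 'classes']
# Scripts without word delimiters (e.g. Chinese or Thai) need a dedicated segmenter.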
In corpus linguistics, part-of-speech tagging (POS tagging or POST), also called grammatical tagging or word-category disambiguation, is the process of marking up a word in a text (corpus) as corresponding to a particular part of speech, based on both its definition and its context, i.e. its relationship with adjacent and related words in a phrase, sentence, or paragraph.
A simplified form of this is commonly taught to school-age children, in the identification of words as nouns, verbs, adjectives, adverbs, etc.
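A minimal sketch of POS tagging with NLTK (assuming NLTK is installed and its 'punkt' and 'averaged_perceptron_tagger' resources have been downloaded; Spark ML does not ship a POS tagger):

# Minimal sketch: POS tagging with NLTK.
import nltk

tokens = nltk.word_tokenize("Logistic regression models are neat")
print(nltk.pos_tag(tokens))
# Prints a list of (word, tag) pairs, e.g. ('models', 'NNS') and ('are', 'VBP').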
import findspark  # Only needed when you run Spark within a Jupyter notebook
findspark.init()
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .config("spark.executor.memory", "2g") \
    .config("spark.cores.max", "2") \
    .master("spark://master:7077") \
    .appName("Python Spark") \
    .getOrCreate()  # connect to the Spark standalone cluster
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
sentenceData = spark.createDataFrame([
(0.0, "Hi I heard about Spark"),
(0.0, "I wish Java could use case classes"),
(1.0, "Logistic regression models are neat")
], ["label", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
tokenizer
Tokenizer_5ab7592eb9d8
wordsData = tokenizer.transform(sentenceData)
wordsData.show()
+-----+--------------------+--------------------+
|label|            sentence|               words|
+-----+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|
|  0.0|I wish Java could...|[i, wish, java, c...|
|  1.0|Logistic regressi...|[logistic, regres...|
+-----+--------------------+--------------------+
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
featurizedData.show()
+-----+--------------------+--------------------+--------------------+
|label|            sentence|               words|         rawFeatures|
+-----+--------------------+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|(20,[0,5,9,17],[1...|
|  0.0|I wish Java could...|[i, wish, java, c...|(20,[2,7,9,13,15]...|
|  1.0|Logistic regressi...|[logistic, regres...|(20,[4,6,13,15,18...|
+-----+--------------------+--------------------+--------------------+
# Alternatively, CountVectorizer can also be used to get term-frequency vectors (see the sketch after the IDF output below)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("label", "features").show()
+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(20,[0,5,9,17],[0...|
|  0.0|(20,[2,7,9,13,15]...|
|  1.0|(20,[4,6,13,15,18...|
+-----+--------------------+
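As noted in the comment above, CountVectorizer can replace HashingTF to produce term-frequency vectors from an explicit, learned vocabulary rather than hashed indices. A minimal sketch, reusing wordsData from above (vocabSize=20 is an illustrative choice):

from pyspark.ml.feature import CountVectorizer

# Minimal sketch: term-frequency vectors via a learned vocabulary.
cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", vocabSize=20)
cvModel = cv.fit(wordsData)
cvModel.transform(wordsData).select("rawFeatures").show(truncate=False)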
Word2Vec is an Estimator which takes sequences of words representing documents and trains a Word2VecModel.
The model maps each word to a unique fixed-size vector.
The Word2VecModel transforms each document into a vector by averaging the vectors of all words in the document; this vector can then be used as features for prediction, document similarity calculations, etc. Please refer to the MLlib user guide on Word2Vec for more details.
from pyspark.ml.feature import Word2Vec
# Input data: Each row is a bag of words from a sentence or document.
documentDF = spark.createDataFrame([
("Hi I heard about Spark".split(" "), ),
("I wish Java could use case classes".split(" "), ),
("Logistic regression models are neat".split(" "), )
], ["text"])
# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(documentDF)
result = model.transform(documentDF)
for row in result.collect():
    text, vector = row
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))
Text: [Hi, I, heard, about, Spark] =>
Vector: [-0.03411836884915829,-0.0009284719824790955,-0.029798223078250887]

Text: [I, wish, Java, could, use, case, classes] =>
Vector: [-0.07186858262866735,-0.046450747177004814,0.016944138316570646]

Text: [Logistic, regression, models, are, neat] =>
Vector: [0.020746785402297976,-0.019827502034604552,-0.01424809228628874]
from pyspark.ml.feature import StopWordsRemover
sentenceData = spark.createDataFrame([
(0, ["I", "saw", "the", "red", "balloon"]),
(1, ["Mary", "had", "a", "little", "lamb"])
], ["id", "raw"])
remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
remover.transform(sentenceData).show(truncate=False)
+---+----------------------------+--------------------+
|id |raw                         |filtered            |
+---+----------------------------+--------------------+
|0  |[I, saw, the, red, balloon] |[saw, red, balloon] |
|1  |[Mary, had, a, little, lamb]|[Mary, little, lamb]|
+---+----------------------------+--------------------+
An n-gram is a sequence of n tokens (typically words) for some integer n. The NGram class can be used to transform input features into n-grams.
NGram takes as input a sequence of strings (e.g. the output of a Tokenizer).
The parameter n is used to determine the number of terms in each n-gram.
The output will consist of a sequence of n-grams where each n-gram is represented by a space-delimited string of n consecutive words. If the input sequence contains fewer than n strings, no output is produced.
from pyspark.ml.feature import NGram
wordDataFrame = spark.createDataFrame([
(0, ["Hi", "I", "heard", "about", "Spark"]),
(1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
(2, ["Logistic", "regression", "models", "are", "neat"])
], ["id", "words"])
ngram = NGram(n=2, inputCol="words", outputCol="ngrams")
ngramDataFrame = ngram.transform(wordDataFrame)
ngramDataFrame.select("ngrams").show(truncate=False)
+------------------------------------------------------------------+
|ngrams                                                            |
+------------------------------------------------------------------+
|[Hi I, I heard, heard about, about Spark]                         |
|[I wish, wish Java, Java could, could use, use case, case classes]|
|[Logistic regression, regression models, models are, are neat]    |
+------------------------------------------------------------------+
LDA is an unsupervised method that models documents and topics based on the Dirichlet distribution, wherein each document is considered to be a distribution over various topics and each topic is modeled as a distribution over words.
Therefore, given a collection of documents, LDA outputs a set of topics, with each topic being associated with a set of words.
To model the distributions, LDA also requires the number of topics (often denoted by k) as an input. In the example below, an LDA model with k = 10 is fitted to a small sample dataset in libsvm format:
from pyspark.ml.clustering import LDA
# Loads data.
dataset = spark.read.format("libsvm").load("/data/sample_lda_libsvm_data.txt")
dataset.head(10)
[Row(label=0.0, features=SparseVector(11, {0: 1.0, 1: 2.0, 2: 6.0, 4: 2.0, 5: 3.0, 6: 1.0, 7: 1.0, 10: 3.0})),
 Row(label=1.0, features=SparseVector(11, {0: 1.0, 1: 3.0, 3: 1.0, 4: 3.0, 7: 2.0, 10: 1.0})),
 Row(label=2.0, features=SparseVector(11, {0: 1.0, 1: 4.0, 2: 1.0, 5: 4.0, 6: 9.0, 8: 1.0, 9: 2.0})),
 Row(label=3.0, features=SparseVector(11, {0: 2.0, 1: 1.0, 3: 3.0, 6: 5.0, 8: 2.0, 9: 3.0, 10: 9.0})),
 Row(label=4.0, features=SparseVector(11, {0: 3.0, 1: 1.0, 2: 1.0, 3: 9.0, 4: 3.0, 6: 2.0, 9: 1.0, 10: 3.0})),
 Row(label=5.0, features=SparseVector(11, {0: 4.0, 1: 2.0, 3: 3.0, 4: 4.0, 5: 5.0, 6: 1.0, 7: 1.0, 8: 1.0, 9: 4.0})),
 Row(label=6.0, features=SparseVector(11, {0: 2.0, 1: 1.0, 3: 3.0, 6: 5.0, 8: 2.0, 9: 2.0, 10: 9.0})),
 Row(label=7.0, features=SparseVector(11, {0: 1.0, 1: 1.0, 2: 1.0, 3: 9.0, 4: 2.0, 5: 1.0, 6: 2.0, 9: 1.0, 10: 3.0})),
 Row(label=8.0, features=SparseVector(11, {0: 4.0, 1: 4.0, 3: 3.0, 4: 4.0, 5: 2.0, 6: 1.0, 7: 3.0})),
 Row(label=9.0, features=SparseVector(11, {0: 2.0, 1: 8.0, 2: 2.0, 4: 3.0, 6: 2.0, 8: 2.0, 9: 7.0, 10: 2.0}))]
# Trains an LDA model.
lda = LDA(k=10, maxIter=10)
model = lda.fit(dataset)
ll = model.logLikelihood(dataset)
lp = model.logPerplexity(dataset)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))
The lower bound on the log likelihood of the entire corpus: -804.0989073878557
The upper bound on perplexity: 3.0926881053379063
# Describe topics.
topics = model.describeTopics(3)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)
The topics described by their top-weighted terms:
+-----+-----------+---------------------------------------------------------------+
|topic|termIndices|termWeights                                                    |
+-----+-----------+---------------------------------------------------------------+
|0    |[10, 3, 6] |[0.19648814891312644, 0.12264707684168723, 0.11548351545500055]|
|1    |[7, 3, 1]  |[0.10244520712465478, 0.09851402141047443, 0.09806248098689352]|
|2    |[0, 6, 2]  |[0.10564528489655169, 0.10356112002939157, 0.09915915143947969]|
|3    |[9, 1, 3]  |[0.10307332123604386, 0.10142717450626668, 0.0986764216315743] |
|4    |[10, 9, 8] |[0.10333739093539139, 0.10116857673363602, 0.09877224621583713]|
|5    |[0, 5, 9]  |[0.11848603167315717, 0.09725217268738957, 0.09659258654778731]|
|6    |[5, 10, 8] |[0.09759768608984509, 0.09671823716617234, 0.09525930143075946]|
|7    |[4, 1, 7]  |[0.13467157815466327, 0.1327400186510903, 0.12373463860503997] |
|8    |[8, 7, 4]  |[0.10731631967008559, 0.10084194789873172, 0.09424650066779519]|
|9    |[0, 9, 8]  |[0.10390473951443446, 0.09958354757343096, 0.09790273488004557]|
+-----+-----------+---------------------------------------------------------------+
# Shows the result
transformed = model.transform(dataset)
transformed.show(truncate=False)
+-----+---------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                       |topicDistribution                                                                                                                                                                                                      |
+-----+---------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0.0  |(11,[0,1,2,4,5,6,7,10],[1.0,2.0,6.0,2.0,3.0,1.0,1.0,3.0])      |[0.9568662978542513,0.004783231116573741,0.0047831892086631,0.0047831553315500945,0.004783188785539371,0.004783193858726235,0.004783184596976723,0.004868166306930777,0.0047832097974883974,0.004783183143300183] |
|1.0  |(11,[0,1,3,4,7,10],[1.0,3.0,1.0,3.0,2.0,1.0])                  |[0.009280598104452465,0.007979172439262976,0.007978984236213103,0.00797913388815745,0.007979049712179791,0.007979053902499032,0.007979010002889992,0.9268869871272688,0.007979007426226103,0.007979003160850305] |
|2.0  |(11,[0,1,2,5,6,8,9],[1.0,4.0,1.0,4.0,9.0,1.0,2.0])             |[0.9625005771994888,0.004158347065222438,0.004158451203311232,0.004158374347463623,0.0041584444109197165,0.004158407186431911,0.00415843599182822,0.004232122424087573,0.004158418182959676,0.0041584219882868515] |
|3.0  |(11,[0,1,3,6,8,9,10],[2.0,1.0,3.0,5.0,2.0,3.0,9.0])            |[0.9668336046977241,0.003677901638530038,0.003677936129834121,0.0036779146317183742,0.0036779378355528075,0.003677903606756995,0.003677920152144397,0.003743048030026147,0.003677909443500446,0.003677923834212565] |
|4.0  |(11,[0,1,2,3,4,6,9,10],[3.0,1.0,1.0,9.0,3.0,2.0,1.0,3.0])      |[0.9640660059489621,0.003984832354457607,0.003984821632987429,0.003984837685632279,0.003984774805402123,0.00398485771918617,0.003984779258684711,0.004055526744363853,0.003984785208479665,0.003984778641844029] |
|5.0  |(11,[0,1,3,4,5,6,7,8,9],[4.0,2.0,3.0,4.0,5.0,1.0,1.0,1.0,4.0]) |[0.9668304180932527,0.003678232144159046,0.0036782065880795662,0.00367824028937495,0.003678249420227395,0.0036783342717041398,0.003678217515123681,0.003743596945962773,0.0036782560567981686,0.003678248675317552] |
|6.0  |(11,[0,1,3,6,8,9,10],[2.0,1.0,3.0,5.0,2.0,2.0,9.0])            |[0.9655055056096569,0.0038251780129399623,0.003825216006441113,0.003825187209191156,0.003825212448429147,0.0038251778572665237,0.0038252006703108576,0.0038929381712159996,0.0038251853762541967,0.0038251986382942375]|
|7.0  |(11,[0,1,2,3,4,5,6,9,10],[1.0,1.0,1.0,9.0,2.0,1.0,2.0,1.0,3.0])|[0.9607952133244586,0.0043475489965052305,0.004347535035794484,0.004347547751809199,0.004347500969354192,0.0043475378613077684,0.00434750374977543,0.004424629967361196,0.004347501819601182,0.0043474805240328275] |
|8.0  |(11,[0,1,3,4,5,6,7],[4.0,4.0,3.0,4.0,2.0,1.0,3.0])             |[0.005056141817231779,0.004347745875485573,0.004347733470530278,0.004347748784265513,0.004347672802770931,0.004347785379700588,0.004347700833691274,0.960162060634373,0.004347709392002069,0.004347701009948812] |
|9.0  |(11,[0,1,2,4,6,8,9,10],[2.0,8.0,2.0,3.0,2.0,2.0,7.0,2.0])      |[0.9702669667917015,0.00329716969642816,0.003297142639560103,0.0032971866699542164,0.003297182147463212,0.0032971616612426802,0.0032971329199629517,0.0033557145571101255,0.0032971661914050106,0.0032971767251719812] |
|10.0 |(11,[0,1,2,3,5,6,9,10],[1.0,1.0,1.0,9.0,2.0,2.0,3.0,3.0])      |[0.9625016669316593,0.004158300604862769,0.004158313303796998,0.004158329547472363,0.004158276239090774,0.004158368051309281,0.004158278501352177,0.004231937886622162,0.0041582690160406,0.0041582599177935504] |
|11.0 |(11,[0,1,4,5,6,7,9],[4.0,1.0,4.0,5.0,1.0,3.0,1.0])             |[0.00556182931266099,0.0047831811109948375,0.004783220683789862,0.004783206912349498,0.004783234588561117,0.004783314587201609,0.004783211974102557,0.9561723035902198,0.004783232548461842,0.004783264691657824] |
+-----+---------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+