In [1]:
import pyspark

In [2]:
# Obtain the SparkContext for this interactive session
# (getOrCreate hands back the active context if one exists, else starts a new one).
sc = pyspark.SparkContext.getOrCreate()

In [4]:
# Create a distributed dataset (RDD) holding the Wikipedia article text read from HDFS.
# By default, textFile splits the input by line, so each record is one line of text.

# Here I have 3 text files saved in HDFS under /user/daniel:
#    httpsen.wikipedia.orgwikiGhazanchetsots_Cathedral.txt
#    httpsen.wikipedia.orgwikiNumerical_modeling_geology.txt
#    httpsen.wikipedia.orgwikiReceiver_operating_characteristic.txt
# The glob pattern 'httpsen*' below matches all three of them.
text = sc.textFile('hdfs:///user/daniel/httpsen*')

In [5]:
# Word-count pipeline:
#   a) flatMap:     break every input line into whitespace-separated tokens.
#   b) filter:      keep only tokens that are longer than one character and
#                   consist solely of alphanumeric characters.
#   c) map:         lower-case each remaining token and pair it with a count of 1.
#   d) reduceByKey: add up the counts of all pairs that share the same word.
counts = (
    text
    .flatMap(lambda row: row.split())
    .filter(lambda token: len(token) > 1 and token.isalnum())
    .map(lambda token: (token.lower(), 1))
    .reduceByKey(lambda left, right: left + right)
)

In [6]:
# Show 20 (word, count) pairs taken from the front of the RDD.
# take() does no sorting, so these are not the 20 most frequent words.
for pair in counts.take(20):
    print(pair)

('modeling', 40)
('from', 63)
('the', 1192)
('free', 6)
('of', 794)
('widely', 4)
('complex', 6)
('physical', 15)
('are', 96)
('as', 92)
('partial', 13)
('with', 77)
('geologists', 2)
('finite', 74)
('difference', 25)
('experiments', 4)
('results', 13)
('both', 10)
('developed', 7)
('modelling', 7)


In [8]:
# Number of records in `counts`, i.e. how many distinct lower-cased words passed the filter.
counts.count()

3063

In [9]:
# IMPORTANT: stop the Spark context so that the application finishes successfully.
sc.stop()