# PySpark Coding Challenge: Word Count from Text Data
#
# Problem Statement: You are given a dataset containing multiple lines of
# text. Write a PySpark script to count the occurrence of each word across
# all lines and display the top 5 most frequent words in descending order.
from pyspark.sql import SparkSession

# Initialize the Spark session (reuses an existing session if one is running).
spark = SparkSession.builder.appName("WordCount").getOrCreate()

# Sample text data: each list element represents one input line.
data = [
    "Apache Spark is amazing",
    "PySpark makes big data processing easy",
    "Big data analytics is the future",
    "Spark is fast and efficient",
]

try:
    # Distribute the lines across the cluster as an RDD.
    rdd = spark.sparkContext.parallelize(data)

    # Tokenize, normalize case, and aggregate a count per word.
    # NOTE: str.split() with no argument splits on any run of whitespace and
    # drops empty tokens — unlike split(" "), which yields "" for repeated
    # spaces and would count the empty string as a word.
    word_counts = (
        rdd.flatMap(lambda line: line.lower().split())  # line -> lowercase words
        .map(lambda word: (word, 1))                    # word -> (word, 1)
        .reduceByKey(lambda a, b: a + b)                # sum counts per word
    )

    # takeOrdered with a negated-count key yields the 5 most frequent words
    # in descending order without collecting the full RDD to the driver.
    top_words = word_counts.takeOrdered(5, key=lambda x: -x[1])

    # Print the results.
    for word, count in top_words:
        print(f"{word}: {count}")
finally:
    # Always release cluster resources, even if the job above raised.
    spark.stop()