From 101072ea24cc0670bbd689d039d5144fa5216a4b Mon Sep 17 00:00:00 2001
From: jmreddy2106
Date: Sat, 25 Dec 2021 18:25:17 -0500
Subject: PySpark example

---
Notes (ignored by git am):
  * pySparkExample.py is a new stand-alone script. It obtains a SparkSession
    with app name "test", parallelizes two (name, age) tuples into an RDD,
    applies an explicit StructType schema (nullable string "name", nullable
    integer "age"), builds a DataFrame from the RDD and prints it with
    show(truncate=False).
  * requirements.txt moves the pyspark pin from 2.4.6 to 3.0.2; all other
    entries are untouched.

 pySparkExample.py | 24 ++++++++++++++++++++++++
 requirements.txt  |  2 +-
 2 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 pySparkExample.py

diff --git a/pySparkExample.py b/pySparkExample.py
new file mode 100644
index 0000000..4037b74
--- /dev/null
+++ b/pySparkExample.py
@@ -0,0 +1,24 @@
+from pyspark import SparkContext,StorageLevel
+from pyspark.sql import SparkSession
+from pyspark.conf import SparkConf
+from pyspark.sql.types import *
+
+
+# Enabling Spark Configuration and SparkSession
+sconf=SparkConf().setAppName("test")
+spark=SparkSession.builder.config(conf=sconf).getOrCreate()
+
+# RDD as a list of tuples
+rdd = spark.sparkContext.parallelize([('Alex',21),('Bob',44)])
+
+# creating a schema using StructType
+schema = StructType([
+    StructField("name", StringType(), True),
+    StructField("age", IntegerType(), True)])
+
+# Creating a dataframe from rdd using schema
+df=spark.createDataFrame(rdd, schema)
+
+# displaying dataframe
+df.show(truncate=False)
+
diff --git a/requirements.txt b/requirements.txt
index 5da3fc9..d911d32 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 kafka-python
 python-dotenv
 tweepy==3.9.0
-pyspark==2.4.6
+pyspark==3.0.2
 spacy
 sklearn
 cassandra-driver
-- 
cgit v1.2.3