PySpark example

author: jmreddy2106 <[email protected]> 2021-12-25 18:25:17 -0500
committer: jmreddy2106 <[email protected]> 2021-12-25 18:25:17 -0500
commit: 101072ea24cc0670bbd689d039d5144fa5216a4b (patch)
tree: 75674afa8c9a68e2a94b683e654a1403105af67b
parent: 7ceb40394bd40075e7fe41b8c125a5d2dcedc781 (diff)
download: KafkaPySpark-101072ea24cc0670bbd689d039d5144fa5216a4b.tar.xz
KafkaPySpark-101072ea24cc0670bbd689d039d5144fa5216a4b.zip
2 files changed, 25 insertions, 1 deletions
diff --git a/pySparkExample.py b/pySparkExample.py
new file mode 100644
index 0000000..4037b74
--- /dev/null
+++ b/pySparkExample.py
@@ -0,0 +1,24 @@
+from pyspark import SparkContext,StorageLevel
+from pyspark.sql import SparkSession
+from pyspark.conf import SparkConf
+from pyspark.sql.types import *
+
+
+# Enabling Spark Configuration and SparkSession
+sconf=SparkConf().setAppName("test")
+spark=SparkSession.builder.config(conf=sconf).getOrCreate()
+
+# RDD as a list of tuples 
+rdd = spark.sparkContext.parallelize([('Alex',21),('Bob',44)])
+
+# creating a schema using StructType
+schema = StructType([
+   StructField("name", StringType(), True),
+   StructField("age", IntegerType(), True)])
+
+# Creating a dataframe from rdd using schema
+df=spark.createDataFrame(rdd, schema)
+
+# displaying dataframe 
+df.show(truncate=False)
+
diff --git a/requirements.txt b/requirements.txt
index 5da3fc9..d911d32 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 kafka-python
 python-dotenv
 tweepy==3.9.0
-pyspark==2.4.6
+pyspark==3.0.2
 spacy
 sklearn
 cassandra-driver
author	jmreddy2106 <[email protected]>	2021-12-25 18:25:17 -0500
committer	jmreddy2106 <[email protected]>	2021-12-25 18:25:17 -0500
commit	101072ea24cc0670bbd689d039d5144fa5216a4b (patch)
tree	75674afa8c9a68e2a94b683e654a1403105af67b
parent	7ceb40394bd40075e7fe41b8c125a5d2dcedc781 (diff)
download	KafkaPySpark-101072ea24cc0670bbd689d039d5144fa5216a4b.tar.xz KafkaPySpark-101072ea24cc0670bbd689d039d5144fa5216a4b.zip