diff options
| author | jmreddy2106 <[email protected]> | 2021-12-25 18:25:17 -0500 |
|---|---|---|
| committer | jmreddy2106 <[email protected]> | 2021-12-25 18:25:17 -0500 |
| commit | 101072ea24cc0670bbd689d039d5144fa5216a4b (patch) | |
| tree | 75674afa8c9a68e2a94b683e654a1403105af67b | |
| parent | 7ceb40394bd40075e7fe41b8c125a5d2dcedc781 (diff) | |
| download | KafkaPySpark-101072ea24cc0670bbd689d039d5144fa5216a4b.tar.xz KafkaPySpark-101072ea24cc0670bbd689d039d5144fa5216a4b.zip | |
PySpark example
| -rw-r--r-- | pySparkExample.py | 24 | ||||
| -rw-r--r-- | requirements.txt | 2 |
2 files changed, 25 insertions, 1 deletions
diff --git a/pySparkExample.py b/pySparkExample.py new file mode 100644 index 0000000..4037b74 --- /dev/null +++ b/pySparkExample.py @@ -0,0 +1,24 @@ +from pyspark import SparkContext,StorageLevel +from pyspark.sql import SparkSession +from pyspark.conf import SparkConf +from pyspark.sql.types import * + + +# Enabling Spark Configuration and SparkSession +sconf=SparkConf().setAppName("test") +spark=SparkSession.builder.config(conf=sconf).getOrCreate() + +# RDD as a list of tuples +rdd = spark.sparkContext.parallelize([('Alex',21),('Bob',44)]) + +# creating a schema using StructType +schema = StructType([ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True)]) + +# Creating a dataframe from rdd using schema +df=spark.createDataFrame(rdd, schema) + +# displaying dataframe +df.show(truncate=False) + diff --git a/requirements.txt b/requirements.txt index 5da3fc9..d911d32 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ kafka-python python-dotenv tweepy==3.9.0 -pyspark==2.4.6 +pyspark==3.0.2 spacy sklearn cassandra-driver |
