aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--pySparkExample.py24
-rw-r--r--requirements.txt2
2 files changed, 25 insertions, 1 deletions
diff --git a/pySparkExample.py b/pySparkExample.py
new file mode 100644
index 0000000..4037b74
--- /dev/null
+++ b/pySparkExample.py
@@ -0,0 +1,24 @@
+from pyspark import SparkContext,StorageLevel
+from pyspark.sql import SparkSession
+from pyspark.conf import SparkConf
+from pyspark.sql.types import *
+
+
+# Enabling Spark Configuration and SparkSession
+sconf=SparkConf().setAppName("test")
+spark=SparkSession.builder.config(conf=sconf).getOrCreate()
+
+# RDD as a list of tuples
+rdd = spark.sparkContext.parallelize([('Alex',21),('Bob',44)])
+
+# creating a schema using StructType
+schema = StructType([
+ StructField("name", StringType(), True),
+ StructField("age", IntegerType(), True)])
+
+# Creating a dataframe from rdd using schema
+df=spark.createDataFrame(rdd, schema)
+
+# displaying dataframe
+df.show(truncate=False)
+
diff --git a/requirements.txt b/requirements.txt
index 5da3fc9..d911d32 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
kafka-python
python-dotenv
tweepy==3.9.0
-pyspark==2.4.6
+pyspark==3.0.2
spacy
sklearn
cassandra-driver