Merge pull request #8 from luciferreeves/main

Tkinter Application Added
author: Bobby <[email protected]> 2022-12-02 17:02:31 -0500
committer: GitHub <[email protected]> 2022-12-02 17:02:31 -0500
commit: af2ada7022d75411f2d74a4a4d2c95dfe3eb2e3a (patch)
tree: 327819303a94662dfcfd921864d2e1134fffd93e /src/helpers
parent: 0eb9ac45dcf195e65ea4941229c7e32f17b46d87 (diff)
parent: c4f746246a66ed644f8cd123f1dd0081abcd1a55 (diff)
download: RedditEngagementPrediction-af2ada7022d75411f2d74a4a4d2c95dfe3eb2e3a.tar.xz
RedditEngagementPrediction-af2ada7022d75411f2d74a4a4d2c95dfe3eb2e3a.zip
4 files changed, 114 insertions, 0 deletions
diff --git a/src/helpers/__init__.py b/src/helpers/__init__.py
new file mode 100644
index 0000000..93f7c10
--- /dev/null
+++ b/src/helpers/__init__.py
@@ -0,0 +1,2 @@
+from .database_handler import DatabaseHandler
+from .preprocessor import Preprocessor
diff --git a/src/helpers/database_handler.py b/src/helpers/database_handler.py
new file mode 100644
index 0000000..af86e6e
--- /dev/null
+++ b/src/helpers/database_handler.py
@@ -0,0 +1,15 @@
+import pandas as pd
+
+
+class DatabaseHandler:
+    """Persist a DataFrame as reddit.csv in the current working directory."""
+    def write(self, df):
+        """Overwrite reddit.csv with ``df``, leaving out the index column."""
+        df.to_csv('reddit.csv', index=False)
+
+    def read(self):
+        """Return reddit.csv as a DataFrame; an empty one if it is missing or blank."""
+        try:
+            return pd.read_csv('reddit.csv')
+        except (FileNotFoundError, pd.errors.EmptyDataError):
+            return pd.DataFrame()
\ No newline at end of file
diff --git a/src/helpers/preprocessor.py b/src/helpers/preprocessor.py
new file mode 100644
index 0000000..692e036
--- /dev/null
+++ b/src/helpers/preprocessor.py
@@ -0,0 +1,95 @@
+import pandas as pd
+
+
+class Preprocessor:
+    """Clean a DataFrame of Reddit posts and reduce it to a fixed set of columns.
+
+    The whole pipeline runs in ``__init__``; read the result from ``self.df`` or
+    ``get_preprocessed_data()``.
+    """
+
+    def __init__(self, dataframe):
+        """Run the cleaning pipeline on a copy of ``dataframe`` and keep the result in ``self.df``.
+
+        dataframe: posts with at least the raw columns of the final selection, i.e. every name
+            in ``cols_to_keep`` except 'hour' and 'day', which are derived from 'created_utc'.
+
+        Raises KeyError when one of those columns is missing.
+        """
+        cols_to_keep = ['title', 'selftext', 'link_flair_text', 'subreddit', 'ups', 'num_comments',
+                        'hour', 'day', 'distinguished', 'author_premium', 'subreddit_subscribers',
+                        'author', 'score', 'created_utc', 'upvote_ratio']
+        # Missing values of these columns become '' (never 0), even if pandas typed them as float.
+        text_cols = ['title', 'selftext', 'link_flair_text', 'subreddit', 'distinguished', 'author']
+        # Copy first so the caller's frame is not left half-processed.
+        self.df = dataframe.copy()
+
+        # Saturated columns (one value repeated in every row) carry no information and are dropped,
+        # except those the final selection needs: dropping them would raise KeyError further down.
+        saturated_cols = []
+        for col in self.df.columns:
+            if col in cols_to_keep or self.df.empty:
+                continue
+            # Build the reference on the frame's own index, because equals() compares the index too.
+            constant = pd.Series([self.df[col].iloc[0]] * len(self.df), index=self.df.index)
+            if self.df[col].equals(constant):
+                saturated_cols.append(col)
+        self.df = self.df.drop(columns=saturated_cols)
+
+        # Fill missing values ('' for text, 0 for numbers), then store every column that is neither
+        # numeric nor boolean as strings.
+        for col in self.df.columns:
+            dtype = self.df[col].dtype
+            if dtype == 'object' or col in text_cols:
+                self.df[col] = self.df[col].fillna('')
+            elif dtype == 'float64' or dtype == 'int64':
+                self.df[col] = self.df[col].fillna(0)
+            if self.df[col].dtype not in ('float64', 'int64', 'bool'):
+                self.df[col] = self.df[col].astype(str)
+
+        # Check for title duplicates
+        print('Duplicate titles: {}'.format(self.df['title'].duplicated().sum()))
+        # The API can return the same post repeatedly: keep only the first row of each title.
+        self.df = self.df.drop_duplicates(subset=['title'], keep='first')
+
+        # Drop every flair column except link_flair_text and author_flair_text.
+        kept_flair = ('link_flair_text', 'author_flair_text')
+        flair_columns = [col for col in self.search(list(self.df.columns), 'flair') if col not in kept_flair]
+        self.df = self.df.drop(columns=flair_columns)
+
+        # Rows with [deleted] or [removed] in any column are not useful for the analysis.
+        for column in list(self.df.columns):
+            self.df = self.df[(self.df[column] != '[deleted]') & (self.df[column] != '[removed]')]
+
+        # Remove polls: keep only rows whose poll_data is "" (the column may have been dropped above).
+        if 'poll_data' in self.df.columns:
+            self.df = self.df[self.df['poll_data'] == '']
+
+        # Derive the posting hour and weekday name from created_utc, read as seconds since the epoch.
+        self.df = self.df.copy()  # own the filtered rows before adding columns
+        self.df['created_utc'] = pd.to_datetime(self.df['created_utc'], unit='s')
+        self.df['hour'] = self.df['created_utc'].dt.hour
+        self.df['day'] = self.df['created_utc'].dt.day_name()
+        self.df = self.df[cols_to_keep].copy()
+
+        # "distinguished" becomes a flag that is True only for moderator posts.
+        self.df['distinguished'] = (self.df['distinguished'] == 'moderator').astype(bool)
+        # author_premium is True for real booleans and for the 'True' left by the string conversion.
+        self.df['author_premium'] = self.df['author_premium'].apply(lambda x: x in (True, 'True')).astype(bool)
+
+        # Enforce the final dtypes of the text and integer columns.
+        for col in ('title', 'selftext', 'link_flair_text', 'subreddit', 'day'):
+            self.df[col] = self.df[col].astype(str)
+        for col in ('hour', 'ups', 'num_comments'):
+            self.df[col] = self.df[col].astype(int)
+
+    # Supplementary Column Search Function
+    def search(self, array, search_term):
+        """
+        Returns a list of strings that contain the search term.
+        """
+        return [string for string in array if search_term in string]
+
+    def get_preprocessed_data(self):
+        """Return the cleaned DataFrame built by ``__init__``."""
+        return self.df
\ No newline at end of file
diff --git a/src/helpers/subreddits.py b/src/helpers/subreddits.py
new file mode 100644
index 0000000..b6242c1
--- /dev/null
+++ b/src/helpers/subreddits.py
@@ -0,0 +1,2 @@
+# Subreddit names, kept together in one module-level list
+SUBREDDITS = ["Python", "datascience", "javascript", "linux", "opensource", "node", "programming", "computerscience", "webdev", "statistics", "MachineLearning", "compsci", "java", "rust", "typescript"]
\ No newline at end of file
author	Bobby <[email protected]>	2022-12-02 17:02:31 -0500
committer	GitHub <[email protected]>	2022-12-02 17:02:31 -0500
commit	af2ada7022d75411f2d74a4a4d2c95dfe3eb2e3a (patch)
tree	327819303a94662dfcfd921864d2e1134fffd93e /src/helpers
parent	0eb9ac45dcf195e65ea4941229c7e32f17b46d87 (diff)
parent	c4f746246a66ed644f8cd123f1dd0081abcd1a55 (diff)
download	RedditEngagementPrediction-af2ada7022d75411f2d74a4a4d2c95dfe3eb2e3a.tar.xz RedditEngagementPrediction-af2ada7022d75411f2d74a4a4d2c95dfe3eb2e3a.zip