about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Bobby <[email protected]> 2022-12-02 17:02:31 -0500
committer GitHub <[email protected]> 2022-12-02 17:02:31 -0500
commit af2ada7022d75411f2d74a4a4d2c95dfe3eb2e3a (patch)
tree 327819303a94662dfcfd921864d2e1134fffd93e
parent 0eb9ac45dcf195e65ea4941229c7e32f17b46d87 (diff)
parent c4f746246a66ed644f8cd123f1dd0081abcd1a55 (diff)
download RedditEngagementPrediction-af2ada7022d75411f2d74a4a4d2c95dfe3eb2e3a.tar.xz
download RedditEngagementPrediction-af2ada7022d75411f2d74a4a4d2c95dfe3eb2e3a.zip
Merge pull request #8 from luciferreeves/main
Tkinter Application Added
-rw-r--r-- .gitignore 132
-rw-r--r-- models/DecisionTreeRegressor_num_comments.pkl bin 14990 -> 0 bytes
-rw-r--r-- models/DecisionTreeRegressor_ups.pkl bin 14734 -> 0 bytes
-rw-r--r-- models/DummyRegressor_num_comments.pkl bin 296 -> 0 bytes
-rw-r--r-- models/DummyRegressor_ups.pkl bin 296 -> 0 bytes
-rw-r--r-- models/GradientBoostingRegressor_num_comments.pkl bin 92316 -> 0 bytes
-rw-r--r-- models/GradientBoostingRegressor_ups.pkl bin 83737 -> 0 bytes
-rw-r--r-- models/KNeighborsRegressor_num_comments.pkl bin 3254160 -> 0 bytes
-rw-r--r-- models/KNeighborsRegressor_ups.pkl bin 3254160 -> 0 bytes
-rw-r--r-- models/LinearRegression_num_comments.pkl bin 264030 -> 0 bytes
-rw-r--r-- models/LinearRegression_ups.pkl bin 264030 -> 0 bytes
-rw-r--r-- models/RandomForestRegressor_num_comments.pkl bin 2977472 -> 0 bytes
-rw-r--r-- models/RandomForestRegressor_ups.pkl bin 3043136 -> 0 bytes
-rw-r--r-- models/RidgeCV_num_comments.pkl bin 264158 -> 0 bytes
-rw-r--r-- models/RidgeCV_ups.pkl bin 264158 -> 0 bytes
-rw-r--r-- project.ipynb 150
-rw-r--r-- src/app.py 33
-rw-r--r-- src/helpers/__init__.py 2
-rw-r--r-- src/helpers/database_handler.py 15
-rw-r--r-- src/helpers/preprocessor.py 95
-rw-r--r-- src/helpers/subreddits.py 2
-rw-r--r-- src/windows/data_fetcher.py 88
-rw-r--r-- src/windows/modeltrainer.py 217
-rw-r--r-- src/windows/plotviewer.py 266
24 files changed, 918 insertions, 82 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..fbeee53
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,132 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+reddit.csv
+models
diff --git a/models/DecisionTreeRegressor_num_comments.pkl b/models/DecisionTreeRegressor_num_comments.pkl
deleted file mode 100644
index bc485a2..0000000
--- a/models/DecisionTreeRegressor_num_comments.pkl
+++ /dev/null
Binary files differ
diff --git a/models/DecisionTreeRegressor_ups.pkl b/models/DecisionTreeRegressor_ups.pkl
deleted file mode 100644
index f9b2516..0000000
--- a/models/DecisionTreeRegressor_ups.pkl
+++ /dev/null
Binary files differ
diff --git a/models/DummyRegressor_num_comments.pkl b/models/DummyRegressor_num_comments.pkl
deleted file mode 100644
index 5878112..0000000
--- a/models/DummyRegressor_num_comments.pkl
+++ /dev/null
Binary files differ
diff --git a/models/DummyRegressor_ups.pkl b/models/DummyRegressor_ups.pkl
deleted file mode 100644
index 14b75af..0000000
--- a/models/DummyRegressor_ups.pkl
+++ /dev/null
Binary files differ
diff --git a/models/GradientBoostingRegressor_num_comments.pkl b/models/GradientBoostingRegressor_num_comments.pkl
deleted file mode 100644
index 998cb3f..0000000
--- a/models/GradientBoostingRegressor_num_comments.pkl
+++ /dev/null
Binary files differ
diff --git a/models/GradientBoostingRegressor_ups.pkl b/models/GradientBoostingRegressor_ups.pkl
deleted file mode 100644
index 3dacc53..0000000
--- a/models/GradientBoostingRegressor_ups.pkl
+++ /dev/null
Binary files differ
diff --git a/models/KNeighborsRegressor_num_comments.pkl b/models/KNeighborsRegressor_num_comments.pkl
deleted file mode 100644
index 54de772..0000000
--- a/models/KNeighborsRegressor_num_comments.pkl
+++ /dev/null
Binary files differ
diff --git a/models/KNeighborsRegressor_ups.pkl b/models/KNeighborsRegressor_ups.pkl
deleted file mode 100644
index bf5c42a..0000000
--- a/models/KNeighborsRegressor_ups.pkl
+++ /dev/null
Binary files differ
diff --git a/models/LinearRegression_num_comments.pkl b/models/LinearRegression_num_comments.pkl
deleted file mode 100644
index a364898..0000000
--- a/models/LinearRegression_num_comments.pkl
+++ /dev/null
Binary files differ
diff --git a/models/LinearRegression_ups.pkl b/models/LinearRegression_ups.pkl
deleted file mode 100644
index 368237f..0000000
--- a/models/LinearRegression_ups.pkl
+++ /dev/null
Binary files differ
diff --git a/models/RandomForestRegressor_num_comments.pkl b/models/RandomForestRegressor_num_comments.pkl
deleted file mode 100644
index f316b3d..0000000
--- a/models/RandomForestRegressor_num_comments.pkl
+++ /dev/null
Binary files differ
diff --git a/models/RandomForestRegressor_ups.pkl b/models/RandomForestRegressor_ups.pkl
deleted file mode 100644
index 2349660..0000000
--- a/models/RandomForestRegressor_ups.pkl
+++ /dev/null
Binary files differ
diff --git a/models/RidgeCV_num_comments.pkl b/models/RidgeCV_num_comments.pkl
deleted file mode 100644
index 1837b0c..0000000
--- a/models/RidgeCV_num_comments.pkl
+++ /dev/null
Binary files differ
diff --git a/models/RidgeCV_ups.pkl b/models/RidgeCV_ups.pkl
deleted file mode 100644
index 58fe2e0..0000000
--- a/models/RidgeCV_ups.pkl
+++ /dev/null
Binary files differ
diff --git a/project.ipynb b/project.ipynb
index 7e3ffd4..c1fa63a 100644
--- a/project.ipynb
+++ b/project.ipynb
@@ -11,7 +11,7 @@
},
{
"cell_type": "code",
- "execution_count": 1104,
+ "execution_count": 1,
"metadata": {},
"outputs": [
{
@@ -31,7 +31,7 @@
"True"
]
},
- "execution_count": 1104,
+ "execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
@@ -56,7 +56,7 @@
},
{
"cell_type": "code",
- "execution_count": 1105,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -66,7 +66,7 @@
},
{
"cell_type": "code",
- "execution_count": 1106,
+ "execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -98,7 +98,7 @@
" after = None\n",
" downloaded = 0\n",
"\n",
- " def __init__(self, posts_per_subreddit=1000):\n",
+ " def __init__(self, posts_per_subreddit=100):\n",
" client_id = 'dog1LGxsD9M3bXtglOzKsQ'\n",
" client_secret = 'nc-HHPBGtz51-_r4vNLGcuCHmT39Lw'\n",
" username = 'NoSeason1949'\n",
@@ -141,13 +141,13 @@
" return self.posts\n",
"\n",
"# Fetch Data\n",
- "data_fetcher = DataFetcher(posts_per_subreddit=1000)\n",
+ "data_fetcher = DataFetcher(posts_per_subreddit=100)\n",
"data_fetcher.fetch()\n"
]
},
{
"cell_type": "code",
- "execution_count": 1107,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -182,6 +182,7 @@
" <th>title</th>\n",
" <th>link_flair_richtext</th>\n",
" <th>...</th>\n",
+ " <th>post_hint</th>\n",
" <th>preview</th>\n",
" <th>media_metadata</th>\n",
" <th>url_overridden_by_dest</th>\n",
@@ -190,7 +191,6 @@
" <th>crosspost_parent</th>\n",
" <th>is_gallery</th>\n",
" <th>gallery_data</th>\n",
- " <th>poll_data</th>\n",
" <th>call_to_action</th>\n",
" </tr>\n",
" </thead>\n",
@@ -199,14 +199,14 @@
" <th>0</th>\n",
" <td>None</td>\n",
" <td>Python</td>\n",
- " <td>Tell /r/python what you're working on this wee...</td>\n",
- " <td>t2_145f96</td>\n",
+ " <td>It's December, which means it's time for [Adve...</td>\n",
+ " <td>t2_9iikd</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
- " <td>Sunday Daily Thread: What's everyone working o...</td>\n",
- " <td>[{'e': 'text', 't': 'Daily Thread'}]</td>\n",
+ " <td>/r/Python's 2022 Advent of Code</td>\n",
+ " <td>[{'e': 'text', 't': 'News'}]</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@@ -223,16 +223,16 @@
" <th>1</th>\n",
" <td>None</td>\n",
" <td>Python</td>\n",
- " <td>Have some burning questions on advanced Python...</td>\n",
+ " <td>Discussion of using Python in a professional e...</td>\n",
" <td>t2_145f96</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
- " <td>Tuesday Daily Thread: Advanced questions</td>\n",
+ " <td>Thursday Daily Thread: Python Careers, Courses...</td>\n",
" <td>[{'e': 'text', 't': 'Daily Thread'}]</td>\n",
" <td>...</td>\n",
- " <td>{'images': [{'source': {'url': 'https://extern...</td>\n",
+ " <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@@ -247,15 +247,16 @@
" <th>2</th>\n",
" <td>None</td>\n",
" <td>Python</td>\n",
- " <td>You might think that's a minor change, but [ne...</td>\n",
- " <td>t2_7l8dn5ub</td>\n",
+ " <td>I just published the [Python Data Science Dece...</td>\n",
+ " <td>t2_4ei3rm7y</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
- " <td>Flake8 took down the gitlab repository in favo...</td>\n",
- " <td>[{'e': 'text', 't': 'News'}]</td>\n",
+ " <td>Python Data Science December</td>\n",
+ " <td>[{'e': 'text', 't': 'Beginner Showcase'}]</td>\n",
" <td>...</td>\n",
+ " <td>self</td>\n",
" <td>{'images': [{'source': {'url': 'https://extern...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@@ -265,23 +266,22 @@
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
- " <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>None</td>\n",
" <td>Python</td>\n",
- " <td>It's still a POC at this point as I am explori...</td>\n",
- " <td>t2_uakheq67</td>\n",
+ " <td>Any library/framework (pygame, kivy, Ursina, e...</td>\n",
+ " <td>t2_x23yv</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
- " <td>I made a CLI tool that helps you stay up to da...</td>\n",
- " <td>[{'e': 'text', 't': 'Beginner Showcase'}]</td>\n",
+ " <td>Any open source games written in Python? Ideal...</td>\n",
+ " <td>[{'e': 'text', 't': 'Discussion'}]</td>\n",
" <td>...</td>\n",
- " <td>{'images': [{'source': {'url': 'https://extern...</td>\n",
- " <td>{'bl1t2pheqxz91': {'status': 'valid', 'e': 'Re...</td>\n",
+ " <td>NaN</td>\n",
+ " <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@@ -295,18 +295,18 @@
" <th>4</th>\n",
" <td>None</td>\n",
" <td>Python</td>\n",
- " <td></td>\n",
- " <td>t2_110daa</td>\n",
+ " <td>Im super excited about this. I have been leari...</td>\n",
+ " <td>t2_64896tmr</td>\n",
" <td>False</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
- " <td>How to Create Python Packages for Sharing Code...</td>\n",
- " <td>[{'e': 'text', 't': 'Tutorial'}]</td>\n",
+ " <td>I made a program that takes a signal from a po...</td>\n",
+ " <td>[{'e': 'text', 't': 'Beginner Showcase'}]</td>\n",
" <td>...</td>\n",
+ " <td>self</td>\n",
" <td>{'images': [{'source': {'url': 'https://extern...</td>\n",
" <td>NaN</td>\n",
- " <td>https://youtube.com/watch?v=fT-3V8t01DE&amp;amp;fe...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@@ -317,7 +317,7 @@
" </tr>\n",
" </tbody>\n",
"</table>\n",
- "<p>5 rows × 118 columns</p>\n",
+ "<p>5 rows × 117 columns</p>\n",
"</div>"
],
"text/plain": [
@@ -329,11 +329,11 @@
"4 None Python \n",
"\n",
" selftext author_fullname saved \\\n",
- "0 Tell /r/python what you're working on this wee... t2_145f96 False \n",
- "1 Have some burning questions on advanced Python... t2_145f96 False \n",
- "2 You might think that's a minor change, but [ne... t2_7l8dn5ub False \n",
- "3 It's still a POC at this point as I am explori... t2_uakheq67 False \n",
- "4 t2_110daa False \n",
+ "0 It's December, which means it's time for [Adve... t2_9iikd False \n",
+ "1 Discussion of using Python in a professional e... t2_145f96 False \n",
+ "2 I just published the [Python Data Science Dece... t2_4ei3rm7y False \n",
+ "3 Any library/framework (pygame, kivy, Ursina, e... t2_x23yv False \n",
+ "4 Im super excited about this. I have been leari... t2_64896tmr False \n",
"\n",
" mod_reason_title gilded clicked \\\n",
"0 None 0 False \n",
@@ -343,58 +343,44 @@
"4 None 0 False \n",
"\n",
" title \\\n",
- "0 Sunday Daily Thread: What's everyone working o... \n",
- "1 Tuesday Daily Thread: Advanced questions \n",
- "2 Flake8 took down the gitlab repository in favo... \n",
- "3 I made a CLI tool that helps you stay up to da... \n",
- "4 How to Create Python Packages for Sharing Code... \n",
- "\n",
- " link_flair_richtext ... \\\n",
- "0 [{'e': 'text', 't': 'Daily Thread'}] ... \n",
- "1 [{'e': 'text', 't': 'Daily Thread'}] ... \n",
- "2 [{'e': 'text', 't': 'News'}] ... \n",
- "3 [{'e': 'text', 't': 'Beginner Showcase'}] ... \n",
- "4 [{'e': 'text', 't': 'Tutorial'}] ... \n",
- "\n",
- " preview \\\n",
- "0 NaN \n",
- "1 {'images': [{'source': {'url': 'https://extern... \n",
- "2 {'images': [{'source': {'url': 'https://extern... \n",
- "3 {'images': [{'source': {'url': 'https://extern... \n",
- "4 {'images': [{'source': {'url': 'https://extern... \n",
+ "0 /r/Python's 2022 Advent of Code \n",
+ "1 Thursday Daily Thread: Python Careers, Courses... \n",
+ "2 Python Data Science December \n",
+ "3 Any open source games written in Python? Ideal... \n",
+ "4 I made a program that takes a signal from a po... \n",
"\n",
- " media_metadata \\\n",
- "0 NaN \n",
- "1 NaN \n",
- "2 NaN \n",
- "3 {'bl1t2pheqxz91': {'status': 'valid', 'e': 'Re... \n",
- "4 NaN \n",
+ " link_flair_richtext ... post_hint \\\n",
+ "0 [{'e': 'text', 't': 'News'}] ... NaN \n",
+ "1 [{'e': 'text', 't': 'Daily Thread'}] ... NaN \n",
+ "2 [{'e': 'text', 't': 'Beginner Showcase'}] ... self \n",
+ "3 [{'e': 'text', 't': 'Discussion'}] ... NaN \n",
+ "4 [{'e': 'text', 't': 'Beginner Showcase'}] ... self \n",
"\n",
- " url_overridden_by_dest author_cakeday \\\n",
- "0 NaN NaN \n",
- "1 NaN NaN \n",
- "2 NaN NaN \n",
- "3 NaN NaN \n",
- "4 https://youtube.com/watch?v=fT-3V8t01DE&amp;fe... NaN \n",
+ " preview media_metadata \\\n",
+ "0 NaN NaN \n",
+ "1 NaN NaN \n",
+ "2 {'images': [{'source': {'url': 'https://extern... NaN \n",
+ "3 NaN NaN \n",
+ "4 {'images': [{'source': {'url': 'https://extern... NaN \n",
"\n",
- " crosspost_parent_list crosspost_parent is_gallery gallery_data poll_data \\\n",
- "0 NaN NaN NaN NaN NaN \n",
- "1 NaN NaN NaN NaN NaN \n",
- "2 NaN NaN NaN NaN NaN \n",
- "3 NaN NaN NaN NaN NaN \n",
- "4 NaN NaN NaN NaN NaN \n",
+ " url_overridden_by_dest author_cakeday crosspost_parent_list \\\n",
+ "0 NaN NaN NaN \n",
+ "1 NaN NaN NaN \n",
+ "2 NaN NaN NaN \n",
+ "3 NaN NaN NaN \n",
+ "4 NaN NaN NaN \n",
"\n",
- " call_to_action \n",
- "0 NaN \n",
- "1 NaN \n",
- "2 NaN \n",
- "3 NaN \n",
- "4 NaN \n",
+ " crosspost_parent is_gallery gallery_data call_to_action \n",
+ "0 NaN NaN NaN NaN \n",
+ "1 NaN NaN NaN NaN \n",
+ "2 NaN NaN NaN NaN \n",
+ "3 NaN NaN NaN NaN \n",
+ "4 NaN NaN NaN NaN \n",
"\n",
- "[5 rows x 118 columns]"
+ "[5 rows x 117 columns]"
]
},
- "execution_count": 1107,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
diff --git a/src/app.py b/src/app.py
new file mode 100644
index 0000000..bb82b4d
--- /dev/null
+++ b/src/app.py
@@ -0,0 +1,33 @@
+import tkinter
+import tkinter.messagebox
+
+import pandas as pd
+
+from helpers.database_handler import DatabaseHandler
+from helpers.preprocessor import Preprocessor
+from windows.data_fetcher import DataDownloader
+from windows.plotviewer import PlotViewer
+
+
+def fetch_data():
+ downloader = DataDownloader()
+ downloader.start()
+ return downloader.posts
+
+if DatabaseHandler().read().empty:
+ # show a message box
+ response = tkinter.messagebox.askokcancel('No Data Found', 'No data found in database. Do you want to fetch data from Reddit?', icon='warning')
+ if response:
+ posts = fetch_data()
+ # ask if user wants to save the data
+ response = tkinter.messagebox.askokcancel('Save Data', 'Do you want to save the data for future use?', icon='warning')
+ posts = pd.DataFrame(posts)
+ posts = Preprocessor(posts).get_preprocessed_data()
+ if response:
+ DatabaseHandler().write(posts)
+else:
+ posts = DatabaseHandler().read()
+
+if __name__ == '__main__':
+ plot_viewer = PlotViewer(posts)
+ plot_viewer.mainloop()
diff --git a/src/helpers/__init__.py b/src/helpers/__init__.py
new file mode 100644
index 0000000..93f7c10
--- /dev/null
+++ b/src/helpers/__init__.py
@@ -0,0 +1,2 @@
+from .database_handler import DatabaseHandler
+from .preprocessor import Preprocessor
diff --git a/src/helpers/database_handler.py b/src/helpers/database_handler.py
new file mode 100644
index 0000000..af86e6e
--- /dev/null
+++ b/src/helpers/database_handler.py
@@ -0,0 +1,15 @@
+import pandas as pd
+
+
+class DatabaseHandler:
+
+ def write(self, df):
+ # save the data to csv
+ df.to_csv('reddit.csv', index=False)
+
+ def read(self):
+ try:
+ df = pd.read_csv('reddit.csv')
+ return df
+ except FileNotFoundError:
+ return pd.DataFrame()
\ No newline at end of file
diff --git a/src/helpers/preprocessor.py b/src/helpers/preprocessor.py
new file mode 100644
index 0000000..692e036
--- /dev/null
+++ b/src/helpers/preprocessor.py
@@ -0,0 +1,95 @@
+import pandas as pd
+
+
+class Preprocessor:
+ def __init__(self, dataframe):
+ self.df = dataframe
+ # Finding Saturated Columns – Columns with same values in all rows
+ saturated_cols = []
+ for col in self.df.columns:
+ first_value = self.df[col].iloc[0]
+ if self.df[col].equals(pd.Series([first_value] * len(self.df[col]))):
+ saturated_cols.append(col)
+
+ # At this point, we can drop the saturated columns as they don't provide any useful information
+ self.df.drop(saturated_cols, axis=1, inplace=True)
+
+ # Replace all NaN values with 0 if the column is numeric or empty string if the column is string
+ for col in self.df.columns:
+ if self.df[col].dtype == 'float64' or self.df[col].dtype == 'int64':
+ self.df[col].fillna(0, inplace=True)
+ if self.df[col].dtype == 'object':
+ self.df[col].fillna('', inplace=True)
+
+ # Replace all NaN values with 0 if the column is numeric or empty string if the column is string
+ for col in self.df.columns:
+ if self.df[col].dtype == 'float64' or self.df[col].dtype == 'int64':
+ self.df[col].fillna(0, inplace=True)
+ if self.df[col].dtype == 'object':
+ self.df[col].fillna('', inplace=True)
+
+ # Convert column to string if column is not numeric or boolean
+ for col in self.df.columns:
+ if self.df[col].dtype != 'float64' and self.df[col].dtype != 'int64' and self.df[col].dtype != 'bool':
+ self.df[col] = self.df[col].astype(str)
+
+ # Check for title duplicates
+ print('Duplicate titles: {}'.format(self.df['title'].duplicated().sum()))
+
+ # The same post can be returned more than once by the API - drop repeated titles, keeping the first occurrence
+ self.df.drop_duplicates(subset=['title'], keep='first', inplace=True)
+
+ # Find all columns that contain 'flair'
+ columns = list(self.df.columns)
+ flair_columns = self.search(columns, 'flair')
+ # remove everything from df columns except link_flair_text and author_flair_text
+ flair_columns = list(filter(lambda x: x not in ['link_flair_text', 'author_flair_text'], flair_columns))
+ self.df.drop(flair_columns, axis=1, inplace=True)
+
+ # Any rows containing [deleted] and [removed] are not useful for our analysis. Find any rows with these values and drop them.
+ columns = list(self.df.columns)
+ for column in columns:
+ self.df = self.df[self.df[column] != '[deleted]']
+ self.df = self.df[self.df[column] != '[removed]']
+
+ # Remove all posts which are polls - where poll_data is not ""
+ try:
+ self.df = self.df[self.df['poll_data'] == '']
+ except:
+ pass
+
+ self.df['created_utc'] = pd.to_datetime(self.df['created_utc'], unit='s')
+ self.df['hour'] = self.df['created_utc'].dt.hour
+ self.df['day'] = self.df['created_utc'].dt.day_name()
+ # self.df.drop('created_utc', axis=1, inplace=True)
+
+ cols_to_keep = ['title', 'selftext', 'link_flair_text', 'subreddit', 'ups', 'num_comments', 'hour', 'day', 'distinguished', 'author_premium', 'subreddit_subscribers', 'author', 'score', 'created_utc', 'upvote_ratio']
+ self.df = self.df[cols_to_keep]
+
+ # "distinguished" column has 2 values - "moderator" and "" - We can convert this to a boolean column
+ self.df['distinguished'] = self.df['distinguished'].apply(lambda x: True if x == 'moderator' else False)
+
+ # Convert author_premium to boolean
+ self.df['author_premium'] = self.df['author_premium'].apply(lambda x: True if x == True else False)
+
+ # Cast the retained columns to their final dtypes: text columns and day to string, distinguished to bool, hour/ups/num_comments to int
+ self.df['title'] = self.df['title'].astype(str)
+ self.df['selftext'] = self.df['selftext'].astype(str)
+ self.df['link_flair_text'] = self.df['link_flair_text'].astype(str)
+ self.df['subreddit'] = self.df['subreddit'].astype(str)
+ self.df['day'] = self.df['day'].astype(str)
+ self.df['distinguished'] = self.df['distinguished'].astype(bool)
+ self.df['hour'] = self.df['hour'].astype(int)
+ self.df['ups'] = self.df['ups'].astype(int)
+ self.df['num_comments'] = self.df['num_comments'].astype(int)
+
+ # Supplementary Column Search Function
+ def search(self, array, search_term):
+ """
+ Returns a list of strings that contain the search term.
+ """
+ return [string for string in array if search_term in string]
+
+
+ def get_preprocessed_data(self):
+ return self.df
\ No newline at end of file
diff --git a/src/helpers/subreddits.py b/src/helpers/subreddits.py
new file mode 100644
index 0000000..b6242c1
--- /dev/null
+++ b/src/helpers/subreddits.py
@@ -0,0 +1,2 @@
+# Define Subreddit List
+SUBREDDITS = ['Python', 'datascience', 'javascript', 'linux', 'opensource', 'node', 'programming', 'computerscience', 'webdev', 'statistics', 'MachineLearning', 'compsci', 'java', 'rust', 'typescript']
\ No newline at end of file
diff --git a/src/windows/data_fetcher.py b/src/windows/data_fetcher.py
new file mode 100644
index 0000000..57763a1
--- /dev/null
+++ b/src/windows/data_fetcher.py
@@ -0,0 +1,88 @@
+import time
+import tkinter
+import tkinter.messagebox
+from tkinter import ttk
+
+import customtkinter
+import requests
+
+from helpers.subreddits import SUBREDDITS
+
+
+class DataDownloader:
+ posts = []
+ after = None
+ downloaded = 0
+ start_time = time.time()
+
+ def __init__(self):
+ self.root = tkinter.Tk()
+ self.root.title('Downloading Data - 0%')
+
+ # center the window
+ posx = int(self.root.winfo_screenwidth() / 2 - 250)
+ posy = int(self.root.winfo_screenheight() / 2 - 50)
+ self.root.geometry(f'500x100+{posx}+{posy}')
+
+ self.root.resizable(False, False)
+ self.root.protocol('WM_DELETE_WINDOW', self.on_closing)
+ self.subreddits = SUBREDDITS
+ self.posts_per_subreddit = 100
+
+ self.progress = ttk.Progressbar(self.root, orient='horizontal', length=500, mode='determinate')
+ self.progress['value'] = 0
+ self.progress['maximum'] = 100
+
+ self.download_label = tkinter.Label(self.root, text='Downloading: 0 / {} Posts'.format(len(SUBREDDITS) * self.posts_per_subreddit))
+
+ self.download_label.pack(fill='x', padx=10, pady=10, side= tkinter.TOP, anchor='w')
+ self.progress.pack(fill='x', padx=10, pady=10)
+
+ self.root.bind('<<DownloadComplete>>', lambda e: self.on_closing())
+ client_id = 'dog1LGxsD9M3bXtglOzKsQ'
+ client_secret = 'nc-HHPBGtz51-_r4vNLGcuCHmT39Lw'
+ username = 'NoSeason1949'
+ password = 'Password@1234'
+ auth = requests.auth.HTTPBasicAuth(client_id, client_secret)
+ data = {
+ 'grant_type': 'password',
+ 'username': username,
+ 'password': password,
+ }
+ headers = {'User-Agent': 'RedditTest/0.1 by {}'.format(username)}
+ res = requests.post('https://www.reddit.com/api/v1/access_token',
+ auth=auth, data=data, headers=headers)
+ self.token = res.json()['access_token']
+ self.headers = {**headers, **{'Authorization': f"bearer {self.token}"}}
+
+ def on_closing(self):
+ self.root.destroy()
+
+ def download(self):
+ for subreddit in self.subreddits:
+ subreddit_download = 0
+ while subreddit_download < self.posts_per_subreddit:
+ url = f"https://oauth.reddit.com/r/{subreddit}/hot?limit=100"
+ if self.after:
+ url += f"&after={self.after}"
+ res = requests.get(url, headers=self.headers)
+ if res.status_code == 200:
+ data = res.json()
+ self.after = data['data']['after']
+ for post in data['data']['children']:
+ self.posts.append(post['data'])
+ subreddit_download += 1
+ self.downloaded += 1
+ dwval = (self.downloaded / (self.posts_per_subreddit * len(self.subreddits))) * 100
+ self.progress['value'] = float(dwval) if dwval < 100 else 100
+ self.root.title(f'Downloading Data - {int(self.progress["value"])}%')
+ time_remaining = (time.time() - self.start_time) / (self.downloaded / (self.posts_per_subreddit * len(self.subreddits))) - (time.time() - self.start_time)
+ time_remaining = time.strftime('%H:%M:%S', time.gmtime(time_remaining))
+ self.download_label['text'] = f'Downloading: {self.downloaded} / {self.posts_per_subreddit * len(self.subreddits)} Posts - {time_remaining} Remaining' if self.downloaded < self.posts_per_subreddit * len(self.subreddits) else 'Download Complete'
+ self.root.update()
+
+ self.root.event_generate('<<DownloadComplete>>', when='tail')
+
+ def start(self):
+ self.root.after(0, self.download)
+ self.root.mainloop()
diff --git a/src/windows/modeltrainer.py b/src/windows/modeltrainer.py
new file mode 100644
index 0000000..25b49c9
--- /dev/null
+++ b/src/windows/modeltrainer.py
@@ -0,0 +1,217 @@
+import html
+import os
+import pickle
+import re
+import warnings
+
+import customtkinter
+import nltk
+from nltk.corpus import stopwords
+from nltk.stem import SnowballStemmer
+from scipy.sparse import hstack
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.preprocessing import LabelBinarizer
+
+warnings.filterwarnings('ignore')
+nltk.download('stopwords')
+nltk.download('wordnet')
+from string import punctuation
+
+import numpy as np
+from sklearn.dummy import DummyRegressor
+from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
+from sklearn.linear_model import LinearRegression, RidgeCV
+from sklearn.model_selection import train_test_split
+from sklearn.neighbors import KNeighborsRegressor
+from sklearn.tree import DecisionTreeRegressor
+
+
+def preprocess(message):
+ stemmer = SnowballStemmer('english')
+ stuff_to_be_removed = list(stopwords.words('english'))+list(punctuation)
+
+ # Convert message to lower case
+ message = str(message)
+ message = message.lower()
+
+ # Remove all the links from the messages
+ message = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
+ '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', message)
+ # Remove all the mentions
+ message =re.sub("(@[A-Za-z0-9_]+)","", message)
+
+ # Remove all the emojis
+ message = re.sub(re.compile("["
+ u"\U0001F600-\U0001F64F" # emoticons
+ u"\U0001F300-\U0001F5FF" # symbols & pictographs
+ u"\U0001F680-\U0001F6FF" # transport & map symbols
+ u"\U0001F1E0-\U0001F1FF" # flags (iOS)
+ u"\U00002500-\U00002BEF" # box drawing, geometric shapes, misc symbols, dingbats, arrows
+ u"\U00002702-\U000027B0"
+ u"\U00002702-\U000027B0"
+ u"\U000024C2-\U0001F251"
+ u"\U0001f926-\U0001f937"
+ u"\U00010000-\U0010ffff"
+ u"\u2640-\u2642"
+ u"\u2600-\u2B55"
+ u"\u200d"
+ u"\u23cf"
+ u"\u23e9"
+ u"\u231a"
+ u"\ufe0f" # variation selector-16 (emoji presentation)
+ u"\u3030"
+ "]+", flags=re.UNICODE), '', message)
+
+ # Remove HTML entities
+ message = html.unescape(message)
+
+ # strip blank spaces
+ message = message.strip()
+
+ # Remove all the punctuations
+ message = message.translate(str.maketrans('', '', punctuation))
+
+ # Remove stopwords and perform stemming
+ message = ' '.join([stemmer.stem(word) for word in message.split() if word not in stuff_to_be_removed])
+
+ # Return the message
+ return message
+
+class ModelTrainer(customtkinter.CTkToplevel):
+ model_dir = 'models/'
+
+ def __init__(self, parent, posts):
+ super().__init__(parent)
+ self.parent = parent
+ self.posts = posts # posts is already a dataframe
+ self.features = ['title', 'selftext', 'subreddit', 'distinguished', 'hour', 'day']
+ self.targets = ['ups', 'num_comments']
+ for col in self.posts.columns:
+ if col not in self.features + self.targets:
+ self.posts.drop(col, axis=1, inplace=True)
+
+ self.categorical_features = ['subreddit', 'distinguished', 'hour', 'day']
+ self.posts['text'] = self.posts['title'] + ' ' + self.posts['selftext']
+ self.posts['text'] = self.posts['text'].apply(lambda x: preprocess(x))
+
+ self.text_features = ['text']
+ self.title('Reddit Data Analysis - Building Models')
+ posx = int(self.winfo_screenwidth()/2 - 300)
+ posy = int(self.winfo_screenheight()/2 - 150)
+ self.geometry('600x300+{}+{}'.format(posx, posy))
+ self.resizable(False, False)
+ self.protocol('WM_DELETE_WINDOW', self.disable_event)
+
+ self.updates = customtkinter.CTkTextbox(self, height=300, width=600, state = 'disabled')
+ self.updates.pack(fill='both', expand=True)
+
+ # Create a hash table to store the model objects
+ self.model_hashmap = {
+ "DummyRegressor": DummyRegressor(),
+ "LinearRegression": LinearRegression(),
+ "RidgeCV": RidgeCV(cv=10),
+ "KNeighborsRegressor": KNeighborsRegressor(),
+ "DecisionTreeRegressor": DecisionTreeRegressor(min_samples_split=45, min_samples_leaf=45, random_state = 10),
+ "RandomForestRegressor": RandomForestRegressor(n_jobs=-1, n_estimators=70, min_samples_leaf=10, random_state = 10),
+ "GradientBoostingRegressor": GradientBoostingRegressor(n_estimators=70, max_depth=5)
+ }
+
+ self.start()
+
+ def disable_event(self):
+ pass
+
+ def edit_textbox(self, text, line, type='wait'):
+ emoji = '🕐' if type == 'wait' else '✅'
+ line_next = line + 1
+ line = str(line) + '.0'
+ line_next = str(line_next) + '.0'
+ self.updates.configure(state='normal')
+ if type == 'wait':
+ self.updates.insert(line, emoji + ' ' + text + '...' + '\n\n')
+ else:
+ self.updates.delete(line, line_next)
+ self.updates.insert(line, emoji + ' ' + text + '\n\n')
+ self.updates.configure(state='disabled')
+
+ # scroll to line
+ self.updates.see(line)
+
+ # update the window
+ self.update()
+
+
+ def start(self):
+ self.ups = self.posts['ups']
+ self.num_comments = self.posts['num_comments']
+
+ # select only the categorical features (subreddit, distinguished, hour, day), the text column, and the target (ups or num_comments)
+ self.posts_ups = self.posts[self.categorical_features + self.text_features + ['ups']]
+ self.posts_num_comments = self.posts[self.categorical_features + self.text_features + ['num_comments']]
+ self.tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
+ self.label_binarizer = LabelBinarizer()
+
+ self.edit_textbox('Preparing Data (Upvotes)', 1, 'wait')
+
+ # generate tfidf - label_binarizer for ups
+ self.tfidf_ups = self.tfidf.fit_transform(self.posts_ups['text'])
+ self.category_ups = [self.label_binarizer.fit_transform(self.posts_ups[col]) for col in self.categorical_features]
+ self.category_ups = np.concatenate(self.category_ups, axis=1)
+ self.X_ups = hstack([self.tfidf_ups, self.category_ups])
+ self.y_ups = self.posts_ups['ups']
+
+ # split data into train and test sets
+ self.X_train_ups, self.X_test_ups, self.y_train_ups, self.y_test_ups = train_test_split(self.X_ups, self.y_ups, test_size=0.2, random_state=42)
+
+ self.edit_textbox('Preparing Data (Upvotes)', 1, 'done')
+
+ self.edit_textbox('Preparing Data (Number of Comments)', 3, 'wait')
+
+ # generate tfidf - label_binarizer for num_comments
+ self.tfidf_num_comments = self.tfidf.fit_transform(self.posts_num_comments['text'])
+ self.category_num_comments = [self.label_binarizer.fit_transform(self.posts_num_comments[col]) for col in self.categorical_features]
+ self.category_num_comments = np.concatenate(self.category_num_comments, axis=1)
+ self.X_num_comments = hstack([self.tfidf_num_comments, self.category_num_comments])
+ self.y_num_comments = self.posts_num_comments['num_comments']
+
+ # split data into train and test sets
+ self.X_train_num_comments, self.X_test_num_comments, self.y_train_num_comments, self.y_test_num_comments = train_test_split(self.X_num_comments, self.y_num_comments, test_size=0.2, random_state=42)
+
+ self.edit_textbox('Preparing Data (Number of Comments)', 2, 'done')
+
+ # train models
+ self.train_models()
+
+ # Create a function to save the models
+ def save_model(self, model, model_name):
+ """
+ Saves the model to the models/ directory
+ """
+ if not os.path.exists(self.model_dir):
+ os.mkdir(self.model_dir)
+ with open(self.model_dir + model_name + '.pkl', 'wb') as f:
+ pickle.dump(model, f)
+
+
+ def train_models(self):
+ line_count = 3
+ for model_name, model in self.model_hashmap.items():
+ self.edit_textbox('Training {} for Upvotes'.format(model_name), line_count, 'wait')
+ model.fit(self.X_train_ups, self.y_train_ups)
+ self.save_model(model, model_name + '_ups')
+ self.edit_textbox('Training {} for Upvotes'.format(model_name), line_count, 'done')
+ line_count += 1
+
+ self.edit_textbox('Training {} for Number of Comments'.format(model_name), line_count, 'wait')
+ model.fit(self.X_train_num_comments, self.y_train_num_comments)
+ self.save_model(model, model_name + '_num_comments')
+ self.edit_textbox('Training {} for Number of Comments'.format(model_name), line_count, 'done')
+ line_count += 1
+
+ self.edit_textbox('Training Complete. Models saved to models/ directory. You may now close this window.', line_count, 'done')
+
+ # allow user to close window
+ self.protocol("WM_DELETE_WINDOW", self.enable_close)
+
+ def enable_close(self):
+ self.destroy()
diff --git a/src/windows/plotviewer.py b/src/windows/plotviewer.py
new file mode 100644
index 0000000..da3886e
--- /dev/null
+++ b/src/windows/plotviewer.py
@@ -0,0 +1,266 @@
+import datetime
+import os
+
+import tkinter
+from tkinter import ttk
+
+import customtkinter
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
+from matplotlib.figure import Figure
+
+from helpers.subreddits import SUBREDDITS
+from .modeltrainer import ModelTrainer
+import numpy as np
+
+
def pretty_number(number):
    """Return *number* as a short display string with a B / M / K suffix.

    Values whose magnitude is at least one billion, million or thousand are
    scaled and rendered with two decimals (``'1.50 K'``); anything smaller is
    rendered with ``str()``.  The result is always a ``str`` so callers get a
    consistent type, and the magnitude is used for the threshold test so
    negative values are abbreviated too (``-1500`` -> ``'-1.50 K'``).
    """
    magnitude = abs(number)
    for threshold, suffix in ((1000000000, 'B'), (1000000, 'M'), (1000, 'K')):
        if magnitude >= threshold:
            return '{:.2f} {}'.format(number / threshold, suffix)
    return str(number)
+
+# Author Scoring Function
def author_scores(df):
    """Aggregate an engagement score per (author, subreddit) pair.

    For every post ``final_score = score * upvote_ratio + num_comments`` is
    computed; the numeric columns are then summed per author and subreddit.

    Args:
        df: DataFrame with at least the columns ``author``, ``score``,
            ``subreddit``, ``num_comments`` and ``upvote_ratio``.  It is not
            modified.

    Returns:
        DataFrame with columns ``author``, ``subreddit``, ``score``,
        ``num_comments``, ``upvote_ratio`` and ``final_score`` (all numeric
        columns summed per group, so ``upvote_ratio`` is a sum of ratios).
    """
    # Copy the column subset: assigning a new column to a slice of the
    # caller's frame triggers SettingWithCopyWarning and is ambiguous about
    # whether the caller's data is touched.
    df_author = df[['author', 'score', 'subreddit', 'num_comments', 'upvote_ratio']].copy()
    df_author['final_score'] = df_author['score'] * df_author['upvote_ratio'] + df_author['num_comments']
    return df_author.groupby(['author', 'subreddit']).sum().reset_index()
+
+# Plot Viewer Window
class PlotViewer(customtkinter.CTk):
    """Main window showing exploratory plots of the fetched Reddit posts.

    Every analysis is rendered into its own tab of a ``CTkTabview``.  The
    last tab lists the first rows of the data and offers model training.

    Args:
        posts: DataFrame of posts.  The plots read the columns ``subreddit``,
            ``subreddit_subscribers``, ``author``, ``ups``, ``score``,
            ``num_comments``, ``upvote_ratio`` and ``created_utc``.
    """

    # Tab titles in display order.
    _TAB_NAMES = (
        "Posts",
        "Subscribers",
        "Author Activity",
        "Multi-Subreddit Analysis",
        "Posts per Day",
        "Top 10 Authors",
        "Best Time Analysis",
        "Scores Boxplot",
        "Scores vs Comments",
        "View Data / Predictions",
    )
    _DATA_TAB = "View Data / Predictions"
    # Layout of the per-subreddit small-multiples figures.
    _GRID_COLUMNS = 3
    _MIN_GRID_ROWS = 5
    # Directory (relative to the working directory) checked for saved models.
    _MODELS_DIR = 'models'
    # Number of rows shown in the data table.
    _MAX_TABLE_ROWS = 100

    def __init__(self, posts):
        super().__init__()
        self.title('Reddit Data Analysis - Plot Viewer')
        # Centre the 1200x800 window on the screen.
        posx = int(self.winfo_screenwidth() / 2 - 600)
        posy = int(self.winfo_screenheight() / 2 - 400)
        self.geometry(f'1200x800+{posx}+{posy}')
        self.posts = posts
        self.create_tabs()

    def create_tabs(self):
        """Create the tab view and populate every tab."""
        self.tabview = customtkinter.CTkTabview(self)
        for tab_name in self._TAB_NAMES:
            self.tabview.add(tab_name)

        # Number of distinct subreddits each author posted in; shared by the
        # "Author Activity" and "Multi-Subreddit Analysis" tabs.
        n_subreddits = self.posts.groupby('author')['subreddit'].nunique()

        self.posts_plot = self._build_posts_tab()
        self.subscribers_plot = self._build_subscribers_tab()
        self.author_activity_plot = self._build_author_activity_tab(n_subreddits)
        self.multi_subreddit_plot = self._build_multi_subreddit_tab(n_subreddits)
        self.ppd_plot = self._build_posts_per_day_tab()
        self.top_10_authors_plot = self._build_top_authors_tab()
        self.best_time_plot = self._build_best_time_tab()
        self.scores_boxplot = self._build_scores_boxplot_tab()
        self.scores_comments_plot = self._build_scores_vs_comments_tab()
        self._build_data_tab()

        self.tabview.pack(expand=True, fill='both')

    def train_models(self):
        """Open the modal training window, then switch the tab to "Predict"."""
        # open model training child window
        mt = ModelTrainer(self, self.posts)
        mt.grab_set()
        mt.focus_set()
        self.wait_window(mt)

        self.models_label.destroy()
        self.models_button.destroy()
        self._show_predict_controls()

    # ------------------------------------------------------------------
    # Figure helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _single_axes():
        """Return ``(figure, axes)`` for a full-tab single plot."""
        fig = Figure(figsize=(12, 8), dpi=72)
        return fig, fig.add_subplot(111)

    @classmethod
    def _axes_grid(cls, n_panels):
        """Return ``(figure, flat_axes)`` with at least *n_panels* axes.

        A ``Figure`` is built directly instead of going through
        ``plt.subplots``: pyplot would register the figure globally and, with
        a Tk backend, create its own window manager next to this application.
        The grid keeps the 5x3 layout and only grows when there are more
        panels than cells, so indexing can no longer run past the grid.
        """
        rows = max(cls._MIN_GRID_ROWS, -(-n_panels // cls._GRID_COLUMNS))
        fig = Figure(figsize=(20, 20), dpi=24)
        axes = fig.subplots(rows, cls._GRID_COLUMNS, squeeze=False)
        # Row-major flattening: flat[i] is axes[i // columns, i % columns].
        return fig, axes.ravel()

    @staticmethod
    def _annotate_bars(ax, label_for):
        """Write ``label_for(height)`` just above every bar of *ax*."""
        for patch in ax.patches:
            height = patch.get_height()
            ax.annotate(
                label_for(height),
                (patch.get_x() + patch.get_width() / 2., height),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points',
            )

    def _embed(self, fig, tab_name):
        """Attach *fig* to the tab *tab_name* and return its Tk canvas."""
        canvas = FigureCanvasTkAgg(fig, self.tabview.tab(tab_name))
        fig.tight_layout()
        canvas.get_tk_widget().pack(side=tkinter.TOP, fill=tkinter.BOTH, expand=1)
        return canvas

    # ------------------------------------------------------------------
    # One builder per tab.  Axis and tick labels are set AFTER the seaborn
    # call: seaborn installs its own labels/formatters while plotting, so
    # labels set beforehand are either overwritten or pin stale tick text.
    # ------------------------------------------------------------------

    def _build_posts_tab(self):
        """Bar chart: number of posts per subreddit."""
        fig, ax = self._single_axes()
        sns.countplot(x='subreddit', data=self.posts, ax=ax)
        ax.set_title('Number of posts per subreddit')
        ax.set_xlabel('Subreddit')
        ax.set_ylabel('Number of posts')
        ax.tick_params(axis='x', labelrotation=45)
        self._annotate_bars(ax, '{:1.0f} posts'.format)
        return self._embed(fig, "Posts")

    def _build_subscribers_tab(self):
        """Bar chart: subscriber count per subreddit."""
        fig, ax = self._single_axes()
        sns.barplot(x='subreddit', y='subreddit_subscribers', data=self.posts, ax=ax)
        ax.set_title('Number of subscribers per subreddit')
        ax.set_xlabel('Subreddit')
        ax.set_ylabel('Number of subscribers')
        ax.tick_params(axis='x', labelrotation=45)
        self._annotate_bars(ax, lambda height: '{}'.format(pretty_number(height)))
        return self._embed(fig, "Subscribers")

    def _build_author_activity_tab(self, n_subreddits):
        """Bar chart: how many authors post in 1, 2, ... subreddits."""
        fig, ax = self._single_axes()
        sns.countplot(x=n_subreddits, palette=sns.color_palette("husl"), ax=ax)
        ax.set_title('Authors Posting in multiple Subreddits')
        ax.set_xlabel('Number of Subreddits')
        ax.set_ylabel('Number of Authors')
        self._annotate_bars(ax, '{:1.0f} authors'.format)
        return self._embed(fig, "Author Activity")

    def _build_multi_subreddit_tab(self, n_subreddits):
        """Bar chart: upvotes per author against number of subreddits used."""
        fig, ax = self._single_axes()
        n_upvotes = self.posts.groupby('author')['ups'].sum()
        sns.barplot(x=n_subreddits, y=n_upvotes, palette=sns.color_palette("pastel"), ax=ax)
        ax.set_title('Does posting in multiple subreddits drives more upvotes?')
        ax.set_xlabel('Number of Subreddits')
        ax.set_ylabel('Number of Upvotes')
        self._annotate_bars(ax, '{:1.0f} upvotes'.format)
        # Label ticks with the counts that actually occur (they need not be
        # a gap-free 1..k range) and use the singular form for one subreddit;
        # the previous conditional produced the plural in both branches.
        levels = sorted(n_subreddits.unique())
        ax.set_xticks(
            list(range(len(levels))),
            ['{} Subreddit'.format(n) if n == 1 else '{} Subreddits'.format(n) for n in levels],
        )
        return self._embed(fig, "Multi-Subreddit Analysis")

    def _build_posts_per_day_tab(self):
        """Line charts: posts per day for each subreddit, last 180 days."""
        ppd_df = self.posts.groupby(['subreddit', 'created_utc']).size().reset_index(name='counts')
        # NOTE(review): assumes created_utc is something pd.to_datetime parses
        # as the intended timestamp (not raw epoch seconds) - confirm upstream.
        ppd_df['created_utc'] = pd.to_datetime(ppd_df['created_utc']).dt.date
        ppd_df = ppd_df.groupby(['subreddit', 'created_utc']).sum().reset_index()
        ppd_df = ppd_df.pivot(index='created_utc', columns='subreddit', values='counts')
        ppd_df = ppd_df.fillna(0)
        last_6M = datetime.date.today() - datetime.timedelta(days=180)
        ppd_df = ppd_df.loc[ppd_df.index >= last_6M]
        palette = sns.color_palette("dark6", len(SUBREDDITS))

        fig, axes = self._axes_grid(len(SUBREDDITS))
        fig.suptitle('Number of posts per day per subreddit (Last 6 Months)\n', fontsize=16)
        fig.subplots_adjust(hspace=0.5, wspace=0.5)
        for ax, subreddit, color in zip(axes, SUBREDDITS, palette):
            # A subreddit without any post has no column after the pivot.
            if subreddit in ppd_df.columns:
                sns.lineplot(data=ppd_df[subreddit], ax=ax, color=color)
            ax.set_title(subreddit)
            ax.set_xlabel('Date')
            ax.set_ylabel('Number of Posts')
            # Tick labels are left to the axis: forcing the full date index
            # onto the default ticks labelled tick i with the i-th date.
        return self._embed(fig, "Posts per Day")

    def _build_top_authors_tab(self):
        """Bar charts: ten highest-scoring authors of each subreddit."""
        scores = author_scores(self.posts)
        fig, axes = self._axes_grid(len(SUBREDDITS))
        fig.suptitle('Top 10 Authors per Subreddit\n', fontsize=16)
        fig.subplots_adjust(hspace=0.5, wspace=0.5)
        for ax, subreddit in zip(axes, SUBREDDITS):
            ax.set_title(subreddit)
            # Select per panel instead of groupby().apply(): apply() passing
            # the grouping column to the function is deprecated in pandas.
            top_authors = scores[scores['subreddit'] == subreddit].nlargest(10, 'final_score')
            if top_authors.empty:
                continue
            sns.barplot(x='author', y='final_score', data=top_authors, ax=ax, palette=sns.color_palette("pastel", 10))
            ax.set_ylabel('Final Score')
            ax.set_xlabel('')
            ax.tick_params(axis='x', labelrotation=30)
            for tick_label in ax.get_xticklabels():
                tick_label.set_horizontalalignment('right')
            self._annotate_bars(ax, '{:1.0f}'.format)
        return self._embed(fig, "Top 10 Authors")

    def _build_best_time_tab(self):
        """Line charts: total engagement per hour and weekday, per subreddit."""
        # Copy so the derived columns are not written through a slice of
        # self.posts.
        best_time_df = self.posts[['subreddit', 'created_utc', 'score', 'num_comments']].copy()
        best_time_df['final_score'] = best_time_df['score'] + best_time_df['num_comments']

        # Convert the created_utc column to datetime
        created = pd.to_datetime(best_time_df['created_utc'])
        best_time_df['day'] = created.dt.day_name()
        best_time_df['hour'] = created.dt.hour
        best_time_df = best_time_df.drop(['score', 'num_comments', 'created_utc'], axis=1)

        # Find total engagement per hour
        best_time_df = best_time_df.groupby(['subreddit', 'day', 'hour']).sum().reset_index()
        days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
        best_time_df['day'] = pd.Categorical(best_time_df['day'], categories=days, ordered=True)

        subreddits = best_time_df['subreddit'].unique()
        hour_labels = [f'{hour:02d}:00' for hour in range(24)]
        fig, axes = self._axes_grid(len(subreddits))
        fig.suptitle('Best Time to Post on Each Subreddit\n', fontsize=20)
        for ax, subreddit in zip(axes, subreddits):
            sns.lineplot(x='hour', y='final_score', hue='day', data=best_time_df[best_time_df['subreddit'] == subreddit], ax=ax, palette=sns.color_palette("husl", 7))
            ax.set_title(subreddit)
            ax.set_xticks(range(0, 24))
            ax.set_xticklabels(hour_labels, rotation=45, horizontalalignment='right')
            ax.set_xlabel('Time of Day')
            ax.set_ylabel('Total Engagement')
        return self._embed(fig, "Best Time Analysis")

    def _build_scores_boxplot_tab(self):
        """Box plot: score distribution per subreddit."""
        fig, ax = self._single_axes()
        sns.boxplot(x='subreddit', y='score', data=self.posts, ax=ax)
        ax.set_title('Boxplot of Scores in Each Subreddit')
        ax.set_xlabel('Subreddit')
        ax.set_ylabel('Score')
        return self._embed(fig, "Scores Boxplot")

    def _build_scores_vs_comments_tab(self):
        """Scatter plots: score against number of comments, per subreddit."""
        subreddits = self.posts['subreddit'].unique()
        # One colour per panel; the palette cycles, so the first colours are
        # the same as with the former fixed size of 15.
        palette = sns.color_palette("deep", len(subreddits))
        fig, axes = self._axes_grid(len(subreddits))
        fig.suptitle('Scatterplot of Scores and Number of Comments in Each Subreddit\n', fontsize=20)
        for ax, subreddit, color in zip(axes, subreddits, palette):
            sns.scatterplot(x='score', y='num_comments', data=self.posts[self.posts['subreddit'] == subreddit], ax=ax, color=color)
            ax.set_title(subreddit)
            ax.set_xlabel('Score')
            ax.set_ylabel('Number of Comments')
        return self._embed(fig, "Scores vs Comments")

    # ------------------------------------------------------------------
    # "View Data / Predictions" tab
    # ------------------------------------------------------------------

    def _build_data_tab(self):
        """Show the first rows of the data and the train/predict controls."""
        tab = self.tabview.tab(self._DATA_TAB)

        # show the posts dataframe in a table
        self.posts_table = ttk.Treeview(tab)
        self.posts_table.pack(side=tkinter.TOP, fill=tkinter.BOTH, expand=1)
        self.posts_table['columns'] = list(self.posts.columns)
        for column in self.posts_table['columns']:
            self.posts_table.column(column, anchor='w')
            self.posts_table.heading(column, text=column, anchor='w')

        # hide the first column (index)
        self.posts_table.column('#0', width=0, stretch=tkinter.NO)

        # head() limits the work to the rows actually shown and does not
        # depend on the index holding integer labels.
        for _, row in self.posts.head(self._MAX_TABLE_ROWS).iterrows():
            self.posts_table.insert('', 'end', values=list(row))

        if self._models_available():
            self._show_predict_controls()
            return

        try:
            os.makedirs(self._MODELS_DIR, exist_ok=True)
        except OSError:
            # Best effort only: ModelTrainer.save_model creates the
            # directory itself when it is missing.
            pass
        self.models_label = customtkinter.CTkLabel(tab, text="No models found. Please train the models first.", pady= 10)
        self.models_label.pack()
        self.models_button = customtkinter.CTkButton(tab, text="Train Models", command=self.train_models)
        self.models_button.pack()

    def _models_available(self):
        """Return True when the models directory exists and is not empty."""
        return os.path.isdir(self._MODELS_DIR) and len(os.listdir(self._MODELS_DIR)) > 0

    def _show_predict_controls(self):
        """Create the "Models found" label and the "Predict" button."""
        tab = self.tabview.tab(self._DATA_TAB)
        self.models_label = customtkinter.CTkLabel(tab, text="Models found. Predict by entering data on the next screen.", pady= 10)
        self.models_label.pack()
        # NOTE(review): the button has no command yet - prediction is not wired up.
        self.models_button = customtkinter.CTkButton(tab, text="Predict")
        self.models_button.pack()
+
+