# Define Subreddit List
subreddits = ['python', 'datascience', 'javascript', 'linux', 'opensource', 'node', 'programming', 'computerscience', 'webdev', 'statistics', 'MachineLearning', 'compsci', 'java', 'rust', 'typescript']


# Progress Bar Display Function
def progress(iteration, total, fill='█'):
    """Draw a one-line text progress bar on stdout.

    The line starts and ends with a carriage return so that successive calls
    overwrite each other instead of scrolling the cell output.

    Args:
        iteration: Number of items completed so far.
        total: Number of items expected in total.
        fill: Character used for the completed part of the bar.

    Nothing is printed when ``total`` is not positive (there is no meaningful
    ratio to show). ``iteration`` is clamped to ``[0, total]`` so the bar never
    exceeds 100%.
    """
    prefix = 'Downloading Data'
    suffix = 'Complete'
    decimals = 2
    length = 50

    if total <= 0:
        # Avoids a ZeroDivisionError when there is nothing to download.
        return

    done = min(max(iteration, 0), total)
    percent = f"{100 * (done / float(total)):.{decimals}f}"
    # Integer arithmetic keeps the bar width free of float rounding artefacts.
    filled_length = int(length * done // total)
    bar = fill * filled_length + '-' * (length - filled_length)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end='\r')


# Data Fetcher Class
class DataFetcher:
    """Download "hot" posts from a list of subreddits through Reddit's OAuth API.

    Credentials are read from the environment rather than being stored in the
    notebook. The following variables must be set before instantiation:

    * ``REDDIT_CLIENT_ID``
    * ``REDDIT_CLIENT_SECRET``
    * ``REDDIT_USERNAME``
    * ``REDDIT_PASSWORD``

    Attributes:
        posts: List of raw post dicts (the ``data`` member of each listing child).
        after: Pagination cursor of the most recent listing page, or ``None``.
        downloaded: Total number of posts collected across all subreddits.

    ``requests`` is expected to be imported by the notebook's import cell.
    """

    # Environment variables holding the Reddit "script app" credentials.
    ENV_VARS = ('REDDIT_CLIENT_ID', 'REDDIT_CLIENT_SECRET',
                'REDDIT_USERNAME', 'REDDIT_PASSWORD')
    TOKEN_URL = 'https://www.reddit.com/api/v1/access_token'
    LISTING_URL = 'https://oauth.reddit.com/r/{subreddit}/hot'
    # Reddit listings accept a `limit` of at most 100 items per request.
    PAGE_LIMIT = 100
    # Seconds to wait for a response; without it a stalled connection hangs the cell.
    TIMEOUT = 30

    def __init__(self, posts_per_subreddit=100, subreddit_names=None):
        """Authenticate against Reddit and prepare an empty download state.

        Args:
            posts_per_subreddit: Maximum number of posts to collect per subreddit.
            subreddit_names: Optional iterable of subreddit names. Defaults to
                the module-level ``subreddits`` list.

        Raises:
            RuntimeError: If a credential variable is missing or the token
                response carries no ``access_token``.
            requests.HTTPError: If the token endpoint answers with an error status.
        """
        import os  # function-scope import: the notebook's import cell does not provide it

        credentials = {name: os.environ.get(name) for name in self.ENV_VARS}
        missing = [name for name, value in credentials.items() if not value]
        if missing:
            raise RuntimeError(
                'Missing Reddit credentials; set environment variable(s): '
                + ', '.join(missing)
            )
        username = credentials['REDDIT_USERNAME']

        auth = requests.auth.HTTPBasicAuth(credentials['REDDIT_CLIENT_ID'],
                                           credentials['REDDIT_CLIENT_SECRET'])
        data = {
            'grant_type': 'password',
            'username': username,
            'password': credentials['REDDIT_PASSWORD'],
        }
        headers = {'User-Agent': 'RedditTest/0.1 by {}'.format(username)}
        res = requests.post(self.TOKEN_URL, auth=auth, data=data,
                            headers=headers, timeout=self.TIMEOUT)
        res.raise_for_status()
        token = res.json().get('access_token')
        if not token:
            # Reddit can answer 200 with an error payload (e.g. bad grant).
            raise RuntimeError('Reddit token response did not contain an access_token')

        self.token = token
        self.headers = {**headers, 'Authorization': f"bearer {self.token}"}
        self.subreddits = list(subreddits if subreddit_names is None else subreddit_names)
        self.posts_per_subreddit = posts_per_subreddit
        # Per-instance state: a class-level list would be shared by every instance.
        self.posts = []
        self.after = None
        self.downloaded = 0

    def fetch(self):
        """Collect up to ``posts_per_subreddit`` hot posts from each subreddit.

        Posts are appended to ``self.posts``. A non-200 response prints a
        message and moves on to the next subreddit. Paging for a subreddit
        stops once the quota is met, a page comes back empty, or the listing
        reports no further cursor.
        """
        total = len(self.subreddits) * self.posts_per_subreddit
        for subreddit in self.subreddits:
            # Cursors belong to one listing; never carry them into the next subreddit.
            self.after = None
            fetched = 0
            while fetched < self.posts_per_subreddit:
                remaining = self.posts_per_subreddit - fetched
                params = {'limit': min(self.PAGE_LIMIT, remaining)}
                if self.after:
                    params['after'] = self.after
                res = requests.get(self.LISTING_URL.format(subreddit=subreddit),
                                   headers=self.headers, params=params,
                                   timeout=self.TIMEOUT)
                if res.status_code != 200:
                    print('Failed on subreddit: {}, with status code: {}'.format(subreddit, res.status_code))
                    break

                listing = res.json()['data']
                # Slice defensively in case the server returns more than requested.
                children = listing['children'][:remaining]
                for post in children:
                    self.posts.append(post['data'])
                    fetched += 1
                    self.downloaded += 1
                    progress(self.downloaded, total)

                self.after = listing['after']
                if not children or not self.after:
                    # Listing exhausted: asking again would only repeat page one.
                    break

    def get_data(self):
        """Return the list of post dicts collected so far."""
        return self.posts


# Fetch Data
# In a notebook kernel __name__ is "__main__", so this runs as before; the guard
# only keeps a plain import of these definitions free of network calls.
if __name__ == "__main__":
    data_fetcher = DataFetcher(posts_per_subreddit=100)
    data_fetcher.fetch()
| \n", " | approved_at_utc | \n", "subreddit | \n", "selftext | \n", "author_fullname | \n", "saved | \n", "mod_reason_title | \n", "gilded | \n", "clicked | \n", "title | \n", "link_flair_richtext | \n", "... | \n", "media_metadata | \n", "post_hint | \n", "url_overridden_by_dest | \n", "preview | \n", "crosspost_parent_list | \n", "crosspost_parent | \n", "is_gallery | \n", "gallery_data | \n", "poll_data | \n", "call_to_action | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "None | \n", "Python | \n", "It's December, which means it's time for [Adve... | \n", "t2_9iikd | \n", "False | \n", "None | \n", "0 | \n", "False | \n", "/r/Python's 2022 Advent of Code | \n", "[{'e': 'text', 't': 'News'}] | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 1 | \n", "None | \n", "Python | \n", "Discussion of using Python in a professional e... | \n", "t2_145f96 | \n", "False | \n", "None | \n", "0 | \n", "False | \n", "Thursday Daily Thread: Python Careers, Courses... | \n", "[{'e': 'text', 't': 'Daily Thread'}] | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 2 | \n", "None | \n", "Python | \n", "I made it with my daughter(11yo) to help with ... | \n", "t2_2zcl2abb | \n", "False | \n", "None | \n", "0 | \n", "False | \n", "Programming Time Card Game - 55 cards with pyt... | \n", "[{'e': 'text', 't': 'Beginner Showcase'}] | \n", "... | \n", "{'tthydothgi3a1': {'status': 'valid', 'e': 'Im... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 3 | \n", "None | \n", "Python | \n", "Hi Reddit community,\\n\\nI am currently trying ... | \n", "t2_xf27c | \n", "False | \n", "None | \n", "0 | \n", "False | \n", "I made a VS Code plugin that brings codex to t... | \n", "[{'e': 'text', 't': 'Intermediate Showcase'}] | \n", "... | \n", "{'zc0qseogmg3a1': {'status': 'valid', 'e': 'Im... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 4 | \n", "None | \n", "Python | \n", "\n", " | t2_hv61g | \n", "False | \n", "None | \n", "0 | \n", "False | \n", "Looking at the Python documentation can be dan... | \n", "[{'e': 'text', 't': 'Discussion'}] | \n", "... | \n", "NaN | \n", "link | \n", "https://twitter.com/marekgibney/status/1598706... | \n", "{'images': [{'source': {'url': 'https://extern... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
5 rows × 117 columns
\n", "