# reddit_data_scraping.py

# -*- coding: utf-8 -*-
"""Dissertation Reddit data scraping

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1p2qxUqMWM7cx5glRKA1XSHCHm_7XYkT9

Tawakalit Agboola

https://medium.com/@archanakkokate/scraping-reddit-data-using-python-and-praw-a-beginners-guide-7047962f5d29

https://github.com/topics/medical-text-mining

https://jamanetwork.com/journals/jamadermatology/article-abstract/2810837

https://jamanetwork.com/journals/jamadermatology/article-abstract/2810837

https://www.analyticsvidhya.com/blog/2023/02/extracting-medical-information-from-clinical-text-with-nlp/

https://medium.com/@danieljfeller/easy-medical-nlp-in-python-using-the-medspacy-library-4753f4806b6c
"""

!pip install praw
!pip install asyncpraw
!pip install nest_asyncio

import asyncio
import csv
import os
import pickle
from datetime import datetime, timezone

import asyncpraw
import nest_asyncio
import pandas as pd
import praw

# Patch asyncio so that run_until_complete() can be called while an event loop
# is already running, which is the situation inside a Jupyter/Colab notebook.
nest_asyncio.apply()

def _load_seen_ids(path):
    """Return the object pickled at *path*, or an empty set if it is unusable.

    A missing file means nothing has been scraped yet. An empty or truncated
    file (for example, left behind by an interrupted save) is reported and
    treated the same way instead of aborting the whole script at start-up.

    NOTE: unpickling can execute arbitrary code; only point this at files
    that were written by this script.
    """
    try:
        with open(path, 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        return set()
    except (EOFError, pickle.UnpicklingError) as e:
        print(f"Could not read {path} ({e!r}); starting with an empty set.")
        return set()


# Load previously seen posts and comments to avoid duplicates
seen_posts = _load_seen_ids('seen_posts.pkl')
seen_comments = _load_seen_ids('seen_comments.pkl')

# Shared asyncpraw client used by the scraping coroutines below.
# Credentials are read from the environment so real secrets never have to be
# written into the notebook; when the variables are unset the values fall back
# to the placeholders that were previously hard-coded here.
reddit = asyncpraw.Reddit(
    client_id=os.environ.get('REDDIT_CLIENT_ID', ''),
    client_secret=os.environ.get('REDDIT_CLIENT_SECRET', 'A'),
    user_agent='Tawakalit Agboola',
)

def _utc_isoformat(epoch_seconds):
    """Format a Unix timestamp as a naive ISO-8601 string expressed in UTC.

    ``datetime.fromtimestamp`` without ``tz`` converts to the machine's local
    time zone, so the same post would get different timestamps on different
    machines. Converting in UTC and then dropping the offset keeps the
    existing column format while making the value machine-independent.
    """
    return (
        datetime.fromtimestamp(epoch_seconds, tz=timezone.utc)
        .replace(tzinfo=None)
        .isoformat()
    )


def _append_rows_to_csv(rows, path, id_column):
    """Append *rows* to the CSV at *path*, de-duplicated on *id_column*.

    The header row is written only when the file does not exist yet or is
    empty, so the file stays one well-formed table across runs.
    """
    frame = pd.DataFrame(rows).drop_duplicates(subset=id_column)
    is_new_file = not os.path.exists(path) or os.path.getsize(path) == 0
    frame.to_csv(path, mode='a', header=is_new_file, index=False)


async def scrape_submissions(keyword, limit=10000, delay=10):
    """Search r/all for *keyword* and append unseen posts and comments to CSV.

    Submissions whose ID is already in the module-level ``seen_posts`` set are
    skipped. Every other submission is loaded, ``replace_more(limit=0)`` is
    awaited, and one row for the post plus one row per comment not yet in
    ``seen_comments`` is collected. Collected rows are appended to
    ``reddit_posts.csv`` / ``reddit_comments.csv`` and both seen-ID sets are
    pickled, also when the loop stops early because of an error.

    Args:
        keyword: Query string passed to ``subreddit.search``.
        limit: Maximum number of search results to request.
        delay: Seconds to sleep after each successfully scraped submission.

    Returns:
        None. Results are written to disk.
    """
    subreddit = await reddit.subreddit("all")
    submissions = subreddit.search(keyword, limit=limit)
    posts = []
    comments = []
    backoff = 1  # Start with 1 second backoff
    MAX_BACKOFF = 60  # Max backoff time in seconds

    try:
        async for submission in submissions:
            if submission.id in seen_posts:
                continue  # Skip if we've already seen this post
            # Claim the ID before the first await so that another keyword task
            # running concurrently does not scrape the same submission.
            seen_posts.add(submission.id)
            try:
                # Ensure the submission is fully fetched
                await submission.load()
                await submission.comments.replace_more(limit=0)
                post_row = {
                    'Post ID': submission.id,
                    'Post Title': submission.title,
                    'Post Body': submission.selftext,
                    'Score': submission.score,
                    'URL': submission.url,
                    'Post Author': str(submission.author),
                    'Subreddit': str(submission.subreddit),
                    'Post Timestamp': _utc_isoformat(submission.created_utc),
                }
                comment_rows = [
                    {
                        'Comment ID': comment.id,
                        'Parent ID': comment.parent_id.split('_')[1],
                        'Comment Author': str(comment.author),
                        'Comment Body': comment.body,
                        'Comment Score': comment.score,
                        'Comment Timestamp': _utc_isoformat(comment.created_utc),
                        'Link to Post': f'https://reddit.com{comment.permalink}',
                    }
                    for comment in submission.comments.list()
                    if comment.id not in seen_comments
                ]
            except asyncpraw.exceptions.RedditAPIException as e:
                # Release the claim: nothing was stored for this post, so a
                # later run must be allowed to try it again.
                seen_posts.discard(submission.id)
                if '429' in str(e):  # Rate-limiting error
                    print(f"Rate limit hit, sleeping for {backoff} seconds")
                else:
                    print(f"API exception occurred: {e}")
                await asyncio.sleep(backoff)
                backoff = min(backoff * 2, MAX_BACKOFF)  # Exponential backoff
            except asyncpraw.exceptions.ClientException as e:
                seen_posts.discard(submission.id)
                print(f"Client exception occurred: {e}")
                await asyncio.sleep(backoff)
                backoff = min(backoff * 2, MAX_BACKOFF)
            except Exception as e:
                seen_posts.discard(submission.id)
                print(f"An unexpected error occurred: {e}")
                break
            else:
                # Commit the post and its comments together, only once both
                # were fetched, so the CSVs and the seen sets stay in step.
                posts.append(post_row)
                comments.extend(comment_rows)
                seen_comments.update(row['Comment ID'] for row in comment_rows)
                # Delay between submissions to avoid rate limits
                await asyncio.sleep(delay)
                backoff = 1  # Reset backoff after successful request
    finally:
        # Runs even if the search listing itself raises, so rows collected so
        # far are not thrown away.
        if posts:
            _append_rows_to_csv(posts, 'reddit_posts.csv', 'Post ID')
            print(f"{len(posts)} posts scraped and saved.")

        if comments:
            _append_rows_to_csv(comments, 'reddit_comments.csv', 'Comment ID')
            print(f"{len(comments)} comments scraped and saved.")

        # Persist seen post and comment IDs to avoid duplicates in future runs
        with open('seen_posts.pkl', 'wb') as f:
            pickle.dump(seen_posts, f)

        with open('seen_comments.pkl', 'wb') as f:
            pickle.dump(seen_comments, f)

async def main(keywords):
    """Run one ``scrape_submissions`` task per keyword concurrently.

    Args:
        keywords: Iterable of search strings.

    A keyword whose task raises is reported and skipped; it does not stop the
    remaining keywords.
    """
    keywords = list(keywords)
    tasks = [scrape_submissions(keyword, limit=10000) for keyword in keywords]
    # Without return_exceptions=True the first failure propagates immediately
    # and the caller stops waiting while the other keyword tasks are still
    # scraping and have not saved their results yet.
    results = await asyncio.gather(*tasks, return_exceptions=True)
    for keyword, result in zip(keywords, results):
        if isinstance(result, BaseException):
            print(f"Scraping '{keyword}' failed: {result!r}")

# Search terms handed to main(); one scraping task is started per entry,
# in this order.
keywords = [
    "Upadacitinib",
    "Rinvoq",
    "Abrocitinib",
    "Cibinqo",
    "Baricitinib",
    "Olumiant",
    "Ruxolitinib",
    "Opzelura",
    "Delgocitinib",
    "Corectim",
    "Ritlecitinib",
    "Litfulo",
    "atopic dermatitis",
    "eczema",
    "psoriasis",
    "vitiligo",
    "alopecia areata",
    "JAK inhibitor side effects",
    "Xeljanz",
    "Filgotinib",
    "Deucravacitinib",
    "Jyseleca",
    "Sotyktu",
    "Jakavi",
    "Smyraf",
    "Inrebic",
    "Vonjo",
]

# Drive main() to completion on the current event loop. In a notebook that
# loop is already running, which is why nest_asyncio.apply() was called above:
# it allows run_until_complete() to be entered from inside a running loop.
# NOTE(review): the asyncpraw `reddit` session is never closed in the visible
# code -- confirm whether `await reddit.close()` should follow the run.
loop = asyncio.get_event_loop()
loop.run_until_complete(main(keywords))

import os
import pandas as pd

# Revised scrape_submissions: replaces the earlier definition and writes a CSV
# header row when the output file is first created.
def _utc_isoformat(epoch_seconds):
    """Format a Unix timestamp as a naive ISO-8601 string expressed in UTC.

    ``datetime.fromtimestamp`` without ``tz`` converts to the machine's local
    time zone, so the same post would get different timestamps on different
    machines. Converting in UTC and then dropping the offset keeps the
    existing column format while making the value machine-independent.
    """
    return (
        datetime.fromtimestamp(epoch_seconds, tz=timezone.utc)
        .replace(tzinfo=None)
        .isoformat()
    )


def _append_rows_to_csv(rows, path, id_column):
    """Append *rows* to the CSV at *path*, de-duplicated on *id_column*.

    The header row is written only when the file does not exist yet or is
    empty, so the file stays one well-formed table across runs.
    """
    frame = pd.DataFrame(rows).drop_duplicates(subset=id_column)
    is_new_file = not os.path.exists(path) or os.path.getsize(path) == 0
    frame.to_csv(path, mode='a', header=is_new_file, index=False)


async def scrape_submissions(keyword, limit=10000, delay=10):
    """Search r/all for *keyword* and append unseen posts and comments to CSV.

    Submissions whose ID is already in the module-level ``seen_posts`` set are
    skipped. Every other submission is loaded, ``replace_more(limit=0)`` is
    awaited, and one row for the post plus one row per comment not yet in
    ``seen_comments`` is collected. Collected rows are appended to
    ``reddit_posts.csv`` / ``reddit_comments.csv`` and both seen-ID sets are
    pickled, also when the loop stops early because of an error.

    Args:
        keyword: Query string passed to ``subreddit.search``.
        limit: Maximum number of search results to request.
        delay: Seconds to sleep after each successfully scraped submission.

    Returns:
        None. Results are written to disk.
    """
    subreddit = await reddit.subreddit("all")
    submissions = subreddit.search(keyword, limit=limit)
    posts = []
    comments = []
    backoff = 1  # Start with 1 second backoff
    MAX_BACKOFF = 60  # Max backoff time in seconds

    try:
        async for submission in submissions:
            if submission.id in seen_posts:
                continue  # Skip if we've already seen this post
            # Claim the ID before the first await so that another keyword task
            # running concurrently does not scrape the same submission.
            seen_posts.add(submission.id)
            try:
                # Ensure the submission is fully fetched
                await submission.load()
                await submission.comments.replace_more(limit=0)
                post_row = {
                    'Post ID': submission.id,
                    'Post Title': submission.title,
                    'Post Body': submission.selftext,
                    'Score': submission.score,
                    'URL': submission.url,
                    'Post Author': str(submission.author),
                    'Subreddit': str(submission.subreddit),
                    'Post Timestamp': _utc_isoformat(submission.created_utc),
                }
                comment_rows = [
                    {
                        'Comment ID': comment.id,
                        'Parent ID': comment.parent_id.split('_')[1],
                        'Comment Author': str(comment.author),
                        'Comment Body': comment.body,
                        'Comment Score': comment.score,
                        'Comment Timestamp': _utc_isoformat(comment.created_utc),
                        'Link to Post': f'https://reddit.com{comment.permalink}',
                    }
                    for comment in submission.comments.list()
                    if comment.id not in seen_comments
                ]
            except asyncpraw.exceptions.RedditAPIException as e:
                # Release the claim: nothing was stored for this post, so a
                # later run must be allowed to try it again.
                seen_posts.discard(submission.id)
                if '429' in str(e):  # Rate-limiting error
                    print(f"Rate limit hit, sleeping for {backoff} seconds")
                else:
                    print(f"API exception occurred: {e}")
                await asyncio.sleep(backoff)
                backoff = min(backoff * 2, MAX_BACKOFF)  # Exponential backoff
            except asyncpraw.exceptions.ClientException as e:
                seen_posts.discard(submission.id)
                print(f"Client exception occurred: {e}")
                await asyncio.sleep(backoff)
                backoff = min(backoff * 2, MAX_BACKOFF)
            except Exception as e:
                seen_posts.discard(submission.id)
                print(f"An unexpected error occurred: {e}")
                break
            else:
                # Commit the post and its comments together, only once both
                # were fetched, so the CSVs and the seen sets stay in step.
                posts.append(post_row)
                comments.extend(comment_rows)
                seen_comments.update(row['Comment ID'] for row in comment_rows)
                # Delay between submissions to avoid rate limits
                await asyncio.sleep(delay)
                backoff = 1  # Reset backoff after successful request
    finally:
        # Runs even if the search listing itself raises, so rows collected so
        # far are not thrown away.
        if posts:
            _append_rows_to_csv(posts, 'reddit_posts.csv', 'Post ID')
            print(f"{len(posts)} posts scraped and saved.")

        if comments:
            _append_rows_to_csv(comments, 'reddit_comments.csv', 'Comment ID')
            print(f"{len(comments)} comments scraped and saved.")

        # Persist seen post and comment IDs to avoid duplicates in future runs
        with open('seen_posts.pkl', 'wb') as f:
            pickle.dump(seen_posts, f)

        with open('seen_comments.pkl', 'wb') as f:
            pickle.dump(seen_comments, f)

import asyncio
import asyncpraw
import pandas as pd
from datetime import datetime
import nest_asyncio

# Patch asyncio so that run_until_complete() can be called while an event loop
# is already running, which is the situation inside a Jupyter/Colab notebook.
nest_asyncio.apply()

# SECURITY: a client ID and secret used to be hard-coded on these lines.
# Anything committed in source should be treated as leaked -- rotate those
# credentials in the Reddit app settings and provide the new values through
# the REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET environment variables.
# A missing variable raises KeyError naming the variable that must be set.
reddit = asyncpraw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent='Tawakalit Agboola',
)

def _utc_isoformat(epoch_seconds):
    """Format a Unix timestamp as a naive ISO-8601 string expressed in UTC.

    ``datetime.fromtimestamp`` without ``tz`` converts to the machine's local
    time zone, so the same post would get different timestamps on different
    machines. Converting in UTC and then dropping the offset keeps the
    existing column format while making the value machine-independent.
    """
    return (
        datetime.fromtimestamp(epoch_seconds, tz=timezone.utc)
        .replace(tzinfo=None)
        .isoformat()
    )


async def scrape_submissions(keyword, limit=100, delay=1):
    """Search r/all for *keyword* and return the matching posts and comments.

    Each submission is loaded and recorded as one post row; after
    ``replace_more(limit=0)`` is awaited, every comment in the flattened
    comment list is recorded as one comment row.

    Args:
        keyword: Query string passed to ``subreddit.search``.
        limit: Maximum number of search results to request.
        delay: Seconds to sleep after each successfully scraped submission.

    Returns:
        A ``(posts, comments)`` tuple of lists of dicts. If an unexpected
        error stops the scrape early, the rows gathered so far are returned.
    """
    subreddit = await reddit.subreddit("all")
    submissions = subreddit.search(keyword, limit=limit)
    posts = []
    comments = []
    backoff = 1  # Start with 1 second backoff
    MAX_BACKOFF = 60  # Cap so repeated failures cannot grow the sleep without bound

    try:
        async for submission in submissions:
            try:
                # Ensure the submission is fully fetched
                await submission.load()
                posts.append({
                    'Post ID': submission.id,
                    'Title': submission.title,
                    'Score': submission.score,
                    'URL': submission.url,
                    'Post Author': str(submission.author),
                    'Subreddit': str(submission.subreddit),
                    'Post Timestamp': _utc_isoformat(submission.created_utc),
                })

                # Properly await the replacement of MoreComments
                await submission.comments.replace_more(limit=0)
                for comment in submission.comments.list():
                    comments.append({
                        'Comment ID': comment.id,
                        'Parent ID': comment.parent_id.split('_')[1],
                        'Comment Author': str(comment.author),
                        'Comment Body': comment.body,
                        'Comment Score': comment.score,
                        'Comment Timestamp': _utc_isoformat(comment.created_utc),
                        'Link to Post': f'https://reddit.com{comment.permalink}',
                    })
                # Delay between submissions to avoid rate limits
                await asyncio.sleep(delay)
                backoff = 1  # Reset backoff after successful request
            except asyncpraw.exceptions.RedditAPIException as e:
                print(f"API exception occurred: {e}")
                await asyncio.sleep(backoff)
                backoff = min(backoff * 2, MAX_BACKOFF)  # Exponential backoff
            except asyncpraw.exceptions.ClientException as e:
                print(f"Client exception occurred: {e}")
                await asyncio.sleep(backoff)
                backoff = min(backoff * 2, MAX_BACKOFF)  # Exponential backoff
            except Exception as e:
                print(f"An unexpected error occurred: {e}")
                break
    except Exception as e:
        # Raised by the search listing itself (outside the per-submission
        # try). Treat it like any other unexpected error: stop, but still
        # return what was collected instead of losing it.
        print(f"An unexpected error occurred: {e}")

    return posts, comments

async def main(keywords):
    """Scrape all keywords concurrently and write the combined results to CSV.

    Args:
        keywords: Iterable of search strings.

    The posts and comments returned by every ``scrape_submissions`` task are
    merged, de-duplicated on their ID column (one submission can match several
    keywords) and written to ``reddit_posts.csv`` / ``reddit_comments.csv``.
    Both files are overwritten, not appended to. A keyword whose task raises
    is reported and skipped so the other keywords' results are still saved.
    """
    keywords = list(keywords)
    tasks = [scrape_submissions(keyword) for keyword in keywords]
    results = await asyncio.gather(*tasks, return_exceptions=True)

    # Flatten the per-keyword results, leaving out keywords that failed
    all_posts = []
    all_comments = []
    for keyword, result in zip(keywords, results):
        if isinstance(result, BaseException):
            print(f"Scraping '{keyword}' failed: {result!r}")
            continue
        keyword_posts, keyword_comments = result
        all_posts.extend(keyword_posts)
        all_comments.extend(keyword_comments)

    # Check if data is empty before saving
    if all_posts:
        posts_df = pd.DataFrame(all_posts).drop_duplicates(subset='Post ID')
        posts_df.to_csv('reddit_posts.csv', index=False)
        print("Posts data has been scraped and saved.")
    else:
        print("No posts data collected.")

    if all_comments:
        comments_df = pd.DataFrame(all_comments).drop_duplicates(subset='Comment ID')
        comments_df.to_csv('reddit_comments.csv', index=False)
        print("Comments data has been scraped and saved.")
    else:
        print("No comments data collected.")

# Search terms handed to main(); one scraping task is started per entry,
# in this order.
keywords = [
    "Upadacitinib",
    "Rinvoq",
    "Abrocitinib",
    "Cibinqo",
    "Baricitinib",
    "Olumiant",
    "Ruxolitinib",
    "Opzelura",
    "Delgocitinib",
    "Corectim",
    "Ritlecitinib",
    "Litfulo",
    "atopic dermatitis",
    "eczema",
    "psoriasis",
    "vitiligo",
    "alopecia areata",
    "JAK inhibitor side effects",
]

# Drive main() to completion on the current event loop. In a notebook that
# loop is already running, which is why nest_asyncio.apply() was called above:
# it allows run_until_complete() to be entered from inside a running loop.
# NOTE(review): the asyncpraw `reddit` session is never closed in the visible
# code -- confirm whether `await reddit.close()` should follow the run.
loop = asyncio.get_event_loop()
loop.run_until_complete(main(keywords))