# Source code for redditparser

import pandas as pd
import praw
import time
import re
import math
import urllib.request
import os
import shutil


class RedditParser:
    """
    Collects r/Rateme submissions into a list of user records.

    For every usable submission the parser extracts the poster's age and
    gender from the title, averages the ``x/10`` ratings found in the
    top-level comments, downloads the preview image and finally writes all
    records to ``data/users.csv``.
    """

    # Length of every supported query level, in seconds.
    _QUERY_LEVELS = {
        '1day': 86400,
        '1week': 604800,
        '1month': 2678400,    # 31 days
        '3months': 8035200,   # 93 days
        '6months': 16070400,  # 186 days
        '1year': 31536000,    # 365 days
        '3years': 94608000,   # 3 * 365 days
    }
    _DEFAULT_QUERY_LEVEL = '1year'

    # Exactly two digits that are not part of a longer number (so "2018"
    # is not read as age 20).
    _AGE_RE = re.compile(r'(?<!\d)\d{2}(?!\d)')
    # Stand-alone gender token in an upper-cased title. The lookarounds keep
    # letters inside ordinary words ("aM", "For", "I'M") from matching.
    _GENDER_RE = re.compile(r"(?<![A-Z'\u2019])(?:MALE|FEMALE|M|F)(?![A-Z])")
    # "<number>/10"; the trailing lookahead rejects "5/100".
    _RATING_RE = re.compile(r'(\d*\.*\d+)/10(?!\d)')
    _MAX_RATING = 10.0

    # Prefix of the automatic moderator comment, which must not be rated.
    _BOT_MARKER = 'Hi there, thank you for your submission! To be rated on /r/Rateme,'

    _CSV_PATH = 'data/users.csv'
    _CSV_COLUMNS = ['id', 'name', 'age', 'gender', 'score', 'comment_count',
                    'created_epoch', 'comment_karma', 'link_karma', 'verified_email',
                    'is_gold', 'is_mod', 'is_employee', 'image_path']

    # Directory images are stored in; overwritten by setup_imagepath().
    image_dir = 'data/images'

    def __init__(self, client_id, client_secret, password, user_agent, username):
        """
        Authenticates against the Reddit API and prepares the image folder

        Note that this wipes the default image directory (see
        :meth:`setup_imagepath`).

        :param client_id: OAuth client id of the Reddit application
        :param client_secret: OAuth client secret of the Reddit application
        :param password: Password of the Reddit account
        :param user_agent: User agent string sent with every request
        :param username: Username of the Reddit account
        """
        # Authenticate to API
        self.reddit = praw.Reddit(client_id=client_id,
                                  client_secret=client_secret,
                                  password=password,
                                  user_agent=user_agent,
                                  username=username)
        # Main data structure
        self.users = []
        # Create directory
        self.setup_imagepath()

    def query_level(self, level):
        """
        Returns the query level - epoch correspondence

        :param level: query level (str) available levels:
            1day, 1week, 1month, 3months, 6months, 1year, 3years
        :return: Length of the query level in seconds; unknown levels fall
            back to 1 year
        """
        return self._QUERY_LEVELS.get(level, self._QUERY_LEVELS[self._DEFAULT_QUERY_LEVEL])

    def setup_imagepath(self, path='data/images'):
        """
        Creates an empty directory at path, removing anything already there

        :param path: Full path to be created
        :return:
        """
        target = os.path.abspath(path)
        # Remove everything under the image folder so every run starts clean
        if os.path.exists(target):
            shutil.rmtree(target)
        os.makedirs(target, exist_ok=True)
        # Remember the directory so parse_rateme stores images in the same place
        self.image_dir = path

    def store_media(self, submission, fullpath):
        """
        Stores the preview image of a submission without any optimization

        :param submission: Submission object of PRAW
        :param fullpath: Fullpath of image to be saved
        :return: Returns True on successful operation, False when the
            submission has no usable preview or the download fails
        """
        # Image optimization later
        # https://cloudinary.com/blog/image_optimization_in_python
        preview = getattr(submission, 'preview', None)
        if not preview:
            return False
        try:
            # Reddit provides several resized variants; this takes the
            # 'source' entry of the first image.
            url = preview['images'][0]['source']['url']
        except (KeyError, IndexError, TypeError):
            return False
        if not url:
            return False
        try:
            urllib.request.urlretrieve(url, fullpath)
        except Exception:
            # Best effort: a failed download only means the submission is skipped
            return False
        return True

    def get_aboutuser(self, username):
        """
        Returns additional information about a user using about.json of the user

        :param username: Redditor username (str)
        :return: Returns tuple of (created, comment_karma, link_karma, verified_email, is_gold, is_mod,
            is_employee); a tuple of seven empty strings when the lookup fails
        """
        try:
            # Get the about.json of the user
            data = self.reddit.get('user/{}/about.json'.format(username))
            return (data.created_utc,
                    data.comment_karma,
                    data.link_karma,
                    data.has_verified_email,
                    data.is_gold,
                    data.is_mod,
                    data.is_employee)
        except Exception:
            # Best effort: missing or suspended accounts yield empty fields
            return '', '', '', '', '', '', ''

    def get_agegender(self, title):
        """
        Get age and gender from the title if given

        :param title: Submission.title object (str)
        :return: Returns age, gender (str tuple); ('', '') when no age is
            found, (age, '') when no gender marker is found. Gender is
            'M' or 'F'.
        """
        age_match = self._AGE_RE.search(title)
        if age_match is None:
            # Without an age the submission is useless, return immediately
            return '', ''
        # Only accept a stand-alone marker ("22M", "[F23]", "male"); any
        # random m/f letter inside a word is ignored.
        gender_match = self._GENDER_RE.search(title.upper())
        gender = gender_match.group()[0] if gender_match else ''
        return age_match.group(), gender

    def get_score(self, comments):
        """
        Gets the average attractiveness score from submission

        :param comments: Comments from submission
        :return: Returns tuple of Score (float), Number of Comment count (int);
            (0.0, 0) when no comment contains a usable rating
        """
        total = 0.0
        rated = 0
        # Only look at the top level comments
        comments.replace_more(limit=0)
        for top_c in comments:
            body = top_c.body
            # Skip bot message
            if self._BOT_MARKER in body:
                continue
            # Only the first rating of a comment is considered
            found = self._RATING_RE.search(body)
            if found is None:
                continue
            num = found.group(1)
            # Ellipsis inside the number, e.g. ".....5/10" or "6....5/10":
            # keep the part before the dots, or the digits after them when
            # nothing precedes the dots.
            if '..' in num:
                head, _, tail = num.partition('.')
                num = head or tail.lstrip('.')
            value = float(num)
            # Sometimes people say 'you are 13/10', ignore those
            if value <= self._MAX_RATING:
                rated += 1
                total += value

        if rated == 0:
            return 0.0, 0
        return total / rated, rated

    def parse_rateme(self, query_level):
        """
        Main runner method for reddit parsing

        Appends one record per usable submission to ``self.users``, saves
        the preview images and writes all records to ``data/users.csv``.

        :param query_level: query level (str), see :meth:`query_level`
        :return: Returns number of the people has been parsed (int)
        """
        subreddit = self.reddit.subreddit('Rateme')
        window = self.query_level(query_level)
        now = int(time.time())
        # Continue numbering after existing records so a second call does not
        # overwrite images referenced by earlier rows.
        first_id = len(self.users) + 1
        user_id = first_id
        # NOTE(review): Subreddit.submissions() is not available in newer PRAW
        # releases - confirm the installed PRAW version still supports it.
        for submission in subreddit.submissions(now - window, now):
            # Deleted accounts have no author, so there is nothing to look up
            if submission.author is None:
                continue
            # Get age and gender from the submisson title
            age, gender = self.get_agegender(submission.title)
            if age == '' or gender == '':
                continue
            # Get the attractiveness score from the comments
            score, comment_count = self.get_score(submission.comments)
            if score == 0.0:
                continue
            # get_aboutuser never raises; failures come back as empty strings
            created, comment_karma, link_karma,\
                verified_email, is_gold, is_mod, is_employee = self.get_aboutuser(submission.author)
            # Store the media into a folder
            relpath = os.path.join(os.path.relpath(self.image_dir), str(user_id) + '.jpg')
            if not self.store_media(submission, relpath):
                continue
            # Show some result
            print('{} {} {} {}/10 comment_karma:{}, link_karma:{}'.format(submission.author,
                                                                          age,
                                                                          gender,
                                                                          math.ceil(score),
                                                                          comment_karma,
                                                                          link_karma))
            # Create a dictionary for user data
            user = {'id': user_id, 'name': str(submission.author), 'age': age,
                    'comment_count': comment_count, 'score': format(score, '.3f'),
                    'gender': gender, 'created_epoch': created,
                    'comment_karma': comment_karma, 'link_karma': link_karma,
                    'is_gold': is_gold, 'is_mod': is_mod, 'is_employee': is_employee,
                    'verified_email': verified_email, 'image_path': relpath}
            # Save it into the list
            self.users.append(user)
            user_id += 1
        # Convert into pandas dataframe to save in csv format. Passing the
        # columns to the constructor keeps the header even when nothing was parsed.
        df = pd.DataFrame(self.users, columns=self._CSV_COLUMNS)
        os.makedirs(os.path.dirname(self._CSV_PATH), exist_ok=True)
        df.to_csv(self._CSV_PATH, index=False)
        parsed = user_id - first_id
        print('Total of {} people parsed'.format(parsed))
        # Return number of people parsed
        return parsed