From aa0b936605ce6e5d506fede1346e5ed46c44a1f1 Mon Sep 17 00:00:00 2001 From: Randy Olson Date: Sun, 3 Mar 2013 17:41:34 -0500 Subject: [PATCH 1/3] Refactor global options to local options. Global options moved to main. All functions now take whatever options they need as parameters. --- word_freqs.py | 116 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 83 insertions(+), 33 deletions(-) diff --git a/word_freqs.py b/word_freqs.py index 71435d4..42760b0 100755 --- a/word_freqs.py +++ b/word_freqs.py @@ -44,12 +44,12 @@ # put words here that you don't want to include in the word cloud excludedWords = ["/", "--", "...", "deleted", ")x"] -# Global Variable Initialization -options = None - def parse_cmd_line(): - # command-line argument parsing + """ + command-line argument parsing + """ + usage = ("usage: %prog [options] USERNAME TARGET\n\n" "USERNAME sets your Reddit username for the bot\n" "TARGET sets the subreddit or user to count word frequencies for." @@ -89,7 +89,6 @@ def parse_cmd_line(): "selftext, comment body) rather than incrementing" "the total for for each instance.")) - global options options, args = parser.parse_args() if len(args) != 2: @@ -106,11 +105,21 @@ def parse_cmd_line(): if options.period not in ["day", "week", "month", "year", "all"]: parser.error("Invalid period.") - return user, target + return user, target, options, args -def parseText(text): - """Parse the passed in text and add words that are not common.""" +def parseText(text, count_word_freqs, max_threshold): + """ + Parse the passed in text and add words that are not common. + + :arg `count_word_freqs`: only count a word once per text block (title, + selftext, comment body) rather than incrementing + the total for for each instance. + + :arg `max_threshold`: maximum relative frequency in the text a word can + appear to be considered in word counts. + prevents word spamming in a single submission. + """ total = 0.0 # intentionally a float text_words = defaultdict(int) for word in text.split(): # Split on all whitespace @@ -121,50 +130,85 @@ def parseText(text): # Add to popularWords list for word, count in text_words.items(): - if count / total <= options.max_threshold: - if options.count_word_freqs: + if count / total <= max_threshold: + if count_word_freqs: popularWords[word] += count else: popularWords[word] += 1 -def processRedditor(redditor): - """Parse submissions and comments for the given Redditor.""" - for entry in with_status(redditor.get_overview(limit=options.limit)): +def processRedditor(redditor, limit, count_word_freqs, max_threshold): + """ + Parse submissions and comments for the given Redditor. + + :arg `limit`: the maximum number of submissions to scrape from the subreddit + + :arg `count_word_freqs`: only count a word once per text block (title, + selftext, comment body) rather than incrementing + the total for for each instance. + + :arg `max_threshold`: maximum relative frequency in the text a word can + appear to be considered in word counts. + prevents word spamming in a single submission. + + """ + for entry in with_status(redditor.get_overview(limit=limit)): if isinstance(entry, praw.objects.Comment): # Parse comment - parseText(entry.body) + parseText(entry.body, count_word_freqs, max_threshold) else: # Parse submission - processSubmission(entry, include_comments=False) + processSubmission(entry, count_word_freqs, max_threshold, include_comments=False) -def processSubmission(submission, include_comments=True): - """Parse a submission's text and body (if applicable). - - Include the submission's comments when `include_comments` is True. +def processSubmission(submission, count_word_freqs, max_threshold, include_comments=True): + """ + Parse a submission's text and body (if applicable). + + :arg `count_word_freqs`: only count a word once per text block (title, + selftext, comment body) rather than incrementing + the total for for each instance. + + :arg `max_threshold`: maximum relative frequency in the text a word can + appear to be considered in word counts. + prevents word spamming in a single submission. + + :arg `include_comments`: include the submission's comments when True """ if include_comments: # parse all the comments for the submission submission.replace_more_comments() for comment in praw.helpers.flatten_tree(submission.comments): - parseText(comment.body) + parseText(comment.body, count_word_freqs, max_threshold) # parse the title of the submission - parseText(submission.title) + parseText(submission.title, count_word_freqs, max_threshold) # parse the selftext of the submission (if applicable) if submission.is_self: - parseText(submission.selftext) + parseText(submission.selftext, count_word_freqs, max_threshold) -def processSubreddit(subreddit): - """Parse comments, title text, and selftext in a given subreddit.""" +def processSubreddit(subreddit, period, limit, count_word_freqs, max_threshold): + """ + Parse comments, title text, and selftext in a given subreddit. + + :arg `period`: the time period to scrape the subreddit over (day, week, month, etc.) + + :arg `limit`: the maximum number of submissions to scrape from the subreddit + + :arg `count_word_freqs`: only count a word once per text block (title, + selftext, comment body) rather than incrementing + the total for for each instance. + + :arg `max_threshold`: maximum relative frequency in the text a word can + appear to be considered in word counts. + prevents word spamming in a single submission. + """ # determine period to count the words over - params = {'t': options.period} - for submission in with_status(subreddit.get_top(limit=options.limit, - params=params)): + params = {'t': period} + for submission in with_status(subreddit.get_top(limit=limit, params=params)): try: - processSubmission(submission) + processSubmission(submission, count_word_freqs, max_threshold) except HTTPError as exc: sys.stderr.write("\nSkipping submission {0} due to HTTP status {1}" " error. Continuing...\n" @@ -173,7 +217,9 @@ def processSubreddit(subreddit): def with_status(iterable): - """Wrap an iterable outputing '.' for each item (up to 50 a line).""" + """ + Wrap an iterable outputting '.' for each item (up to 100 per line). + """ for i, item in enumerate(iterable): sys.stderr.write('.') sys.stderr.flush() @@ -183,7 +229,9 @@ def with_status(iterable): def main(): - user, target = parse_cmd_line() + + # parse the command-line options and arguments + user, target, options, args = parse_cmd_line() # open connection to Reddit r = praw.Reddit(user_agent="bot by /u/{0}".format(user)) @@ -196,12 +244,14 @@ def main(): target = target[3:] if options.is_subreddit: - processSubreddit(r.get_subreddit(target)) + processSubreddit(r.get_subreddit(target), options.period, options.limit, + options.count_word_freqs, options.max_threshold) else: - processRedditor(r.get_redditor(target)) + processRedditor(r.get_redditor(target), options.limit, + options.count_word_freqs, options.max_threshold) # build a string containing all the words for the word cloud software - output = "" + output = "\n" # open output file to store the output string outFileName = target + ".csv" From 8baa76f95a17bcb2c324cd11b07a81df9544f5c9 Mon Sep 17 00:00:00 2001 From: Randy Olson Date: Sun, 3 Mar 2013 17:55:40 -0500 Subject: [PATCH 2/3] Move tests. Modify tests for new functions. --- tests/test-functions.py => test-functions.py | 7 +++---- tests/test.py => test.py | 0 2 files changed, 3 insertions(+), 4 deletions(-) rename tests/test-functions.py => test-functions.py (91%) rename tests/test.py => test.py (100%) diff --git a/tests/test-functions.py b/test-functions.py similarity index 91% rename from tests/test-functions.py rename to test-functions.py index 4257c56..7ee3e9f 100644 --- a/tests/test-functions.py +++ b/test-functions.py @@ -7,11 +7,10 @@ class TestSequenceFunctions(unittest.TestCase): def setUp(self): - self.user, self.target = wf.parse_cmd_line() wf.popularWords = defaultdict(int) def test_parse_cmd_line(self): - self.user, self.target = wf.parse_cmd_line() + self.user, self.target, options, args = wf.parse_cmd_line() self.assertEqual(self.user, sys.argv[1]) self.assertEqual(self.target, sys.argv[2]) @@ -27,7 +26,7 @@ def test_parseText(self): for word, freq in popularWords.items(): txt += str((word + " ") * freq) - wf.parseText(txt) + wf.parseText(txt, count_word_freqs=True, max_threshold=0.34) self.assertEqual(popularWords, wf.popularWords) # TODO: still need to test: @@ -52,7 +51,7 @@ def test_processSubmission(self): # parse a fixed thread # TODO: make our own test thread sub = r.get_submission(url="http://www.reddit.com/r/pics/comments/92dd8/test_post_please_ignore/") - wf.processSubmission(sub) + wf.processSubmission(sub, count_word_freqs=True, max_threshold=0.34) # only look at the top 10 most-used words in the thread # TODO: look at all words used in thread diff --git a/tests/test.py b/test.py similarity index 100% rename from tests/test.py rename to test.py From 0124f931ed2710f4ce41f2d587e1c581729e908b Mon Sep 17 00:00:00 2001 From: Randy Olson Date: Sun, 3 Mar 2013 17:55:57 -0500 Subject: [PATCH 3/3] Modify all function calls to explicitly state each variable. --- word_freqs.py | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/word_freqs.py b/word_freqs.py index 42760b0..cdd15af 100755 --- a/word_freqs.py +++ b/word_freqs.py @@ -112,7 +112,7 @@ def parseText(text, count_word_freqs, max_threshold): """ Parse the passed in text and add words that are not common. - :arg `count_word_freqs`: only count a word once per text block (title, + :arg `count_word_freqs`: if False, only count a word once per text block (title, selftext, comment body) rather than incrementing the total for for each instance. @@ -143,7 +143,7 @@ def processRedditor(redditor, limit, count_word_freqs, max_threshold): :arg `limit`: the maximum number of submissions to scrape from the subreddit - :arg `count_word_freqs`: only count a word once per text block (title, + :arg `count_word_freqs`: if False, only count a word once per text block (title, selftext, comment body) rather than incrementing the total for for each instance. @@ -154,16 +154,18 @@ def processRedditor(redditor, limit, count_word_freqs, max_threshold): """ for entry in with_status(redditor.get_overview(limit=limit)): if isinstance(entry, praw.objects.Comment): # Parse comment - parseText(entry.body, count_word_freqs, max_threshold) + parseText(text=entry.body, count_word_freqs=count_word_freqs, + max_threshold=max_threshold) else: # Parse submission - processSubmission(entry, count_word_freqs, max_threshold, include_comments=False) + processSubmission(submission=entry, count_word_freqs=count_word_freqs, + max_threshold=max_threshold, include_comments=False) def processSubmission(submission, count_word_freqs, max_threshold, include_comments=True): """ Parse a submission's text and body (if applicable). - :arg `count_word_freqs`: only count a word once per text block (title, + :arg `count_word_freqs`: if False, only count a word once per text block (title, selftext, comment body) rather than incrementing the total for for each instance. @@ -177,14 +179,17 @@ def processSubmission(submission, count_word_freqs, max_threshold, include_comme if include_comments: # parse all the comments for the submission submission.replace_more_comments() for comment in praw.helpers.flatten_tree(submission.comments): - parseText(comment.body, count_word_freqs, max_threshold) + parseText(text=comment.body, count_word_freqs=count_word_freqs, + max_threshold=max_threshold) # parse the title of the submission - parseText(submission.title, count_word_freqs, max_threshold) + parseText(text=submission.title, count_word_freqs=count_word_freqs, + max_threshold=max_threshold) # parse the selftext of the submission (if applicable) if submission.is_self: - parseText(submission.selftext, count_word_freqs, max_threshold) + parseText(text=submission.selftext, count_word_freqs=count_word_freqs, + max_threshold=max_threshold) def processSubreddit(subreddit, period, limit, count_word_freqs, max_threshold): @@ -195,7 +200,7 @@ def processSubreddit(subreddit, period, limit, count_word_freqs, max_threshold): :arg `limit`: the maximum number of submissions to scrape from the subreddit - :arg `count_word_freqs`: only count a word once per text block (title, + :arg `count_word_freqs`: if False, only count a word once per text block (title, selftext, comment body) rather than incrementing the total for for each instance. @@ -206,9 +211,10 @@ def processSubreddit(subreddit, period, limit, count_word_freqs, max_threshold): # determine period to count the words over params = {'t': period} - for submission in with_status(subreddit.get_top(limit=limit, params=params)): + for submission in with_status(iterable=subreddit.get_top(limit=limit, params=params)): try: - processSubmission(submission, count_word_freqs, max_threshold) + processSubmission(submission=submission, count_word_freqs=count_word_freqs, + max_threshold=max_threshold) except HTTPError as exc: sys.stderr.write("\nSkipping submission {0} due to HTTP status {1}" " error. Continuing...\n" @@ -244,11 +250,13 @@ def main(): target = target[3:] if options.is_subreddit: - processSubreddit(r.get_subreddit(target), options.period, options.limit, - options.count_word_freqs, options.max_threshold) + processSubreddit(subreddit=r.get_subreddit(target), period=options.period, + limit=options.limit, count_word_freqs=options.count_word_freqs, + max_threshold=options.max_threshold) else: - processRedditor(r.get_redditor(target), options.limit, - options.count_word_freqs, options.max_threshold) + processRedditor(redditor=r.get_redditor(target), limit=options.limit, + count_word_freqs=options.count_word_freqs, + max_threshold=options.max_threshold) # build a string containing all the words for the word cloud software output = "\n"