From aa0b936605ce6e5d506fede1346e5ed46c44a1f1 Mon Sep 17 00:00:00 2001
From: Randy Olson <rso@randalolson.com>
Date: Sun, 3 Mar 2013 17:41:34 -0500
Subject: [PATCH 1/3] Refactor global options to local options.

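The module-level `options` global is removed. parse_cmd_line() now
returns the parsed options and args alongside user and target, and
main() passes each function the option values it needs as parameters.
Function docstrings are expanded to describe the new parameters.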
---
 word_freqs.py | 116 ++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 83 insertions(+), 33 deletions(-)

diff --git a/word_freqs.py b/word_freqs.py
index 71435d4..42760b0 100755
--- a/word_freqs.py
+++ b/word_freqs.py
@@ -44,12 +44,12 @@
 # put words here that you don't want to include in the word cloud
 excludedWords = ["/", "--", "...", "deleted", ")x"]
 
-# Global Variable Initialization
-options = None
-
 
 def parse_cmd_line():
-    # command-line argument parsing
+    """
+        command-line argument parsing
+    """
+    
     usage = ("usage: %prog [options] USERNAME TARGET\n\n"
              "USERNAME sets your Reddit username for the bot\n"
              "TARGET sets the subreddit or user to count word frequencies for."
@@ -89,7 +89,6 @@ def parse_cmd_line():
                             "selftext, comment body) rather than incrementing"
                             "the total for for each instance."))
 
-    global options
     options, args = parser.parse_args()
 
     if len(args) != 2:
@@ -106,11 +105,21 @@ def parse_cmd_line():
     if options.period not in ["day", "week", "month", "year", "all"]:
         parser.error("Invalid period.")
 
-    return user, target
+    return user, target, options, args
 
 
-def parseText(text):
-    """Parse the passed in text and add words that are not common."""
+def parseText(text, count_word_freqs, max_threshold):
+    """
+        Parse the passed in text and add words that are not common.
+        
+        :arg `count_word_freqs`: only count a word once per text block (title,
+                                 selftext, comment body) rather than incrementing
+                                 the total for for each instance.
+        
+        :arg `max_threshold`: maximum relative frequency in the text a word can
+                              appear to be considered in word counts.
+                              prevents word spamming in a single submission.
+    """
     total = 0.0  # intentionally a float
     text_words = defaultdict(int)
     for word in text.split():  # Split on all whitespace
@@ -121,50 +130,85 @@ def parseText(text):
 
     # Add to popularWords list
     for word, count in text_words.items():
-        if count / total <= options.max_threshold:
-            if options.count_word_freqs:
+        if count / total <= max_threshold:
+            if count_word_freqs:
                 popularWords[word] += count
             else:
                 popularWords[word] += 1
 
 
-def processRedditor(redditor):
-    """Parse submissions and comments for the given Redditor."""
-    for entry in with_status(redditor.get_overview(limit=options.limit)):
+def processRedditor(redditor, limit, count_word_freqs, max_threshold):
+    """
+        Parse submissions and comments for the given Redditor.
+        
+        :arg `limit`: the maximum number of submissions to scrape from the subreddit
+        
+        :arg `count_word_freqs`: only count a word once per text block (title,
+                                 selftext, comment body) rather than incrementing
+                                 the total for for each instance.
+        
+        :arg `max_threshold`: maximum relative frequency in the text a word can
+                              appear to be considered in word counts.
+                              prevents word spamming in a single submission.
+        
+    """
+    for entry in with_status(redditor.get_overview(limit=limit)):
         if isinstance(entry, praw.objects.Comment):  # Parse comment
-            parseText(entry.body)
+            parseText(entry.body, count_word_freqs, max_threshold)
         else:  # Parse submission
-            processSubmission(entry, include_comments=False)
+            processSubmission(entry, count_word_freqs, max_threshold, include_comments=False)
 
 
-def processSubmission(submission, include_comments=True):
-    """Parse a submission's text and body (if applicable).
-
-    Include the submission's comments when `include_comments` is True.
+def processSubmission(submission, count_word_freqs, max_threshold, include_comments=True):
+    """
+        Parse a submission's text and body (if applicable).
+
+        :arg `count_word_freqs`: only count a word once per text block (title,
+                                 selftext, comment body) rather than incrementing
+                                 the total for for each instance.
+        
+        :arg `max_threshold`: maximum relative frequency in the text a word can
+                              appear to be considered in word counts.
+                              prevents word spamming in a single submission.
+        
+        :arg `include_comments`: include the submission's comments when True
 
     """
     if include_comments:  # parse all the comments for the submission
         submission.replace_more_comments()
         for comment in praw.helpers.flatten_tree(submission.comments):
-            parseText(comment.body)
+            parseText(comment.body, count_word_freqs, max_threshold)
 
     # parse the title of the submission
-    parseText(submission.title)
+    parseText(submission.title, count_word_freqs, max_threshold)
 
     # parse the selftext of the submission (if applicable)
     if submission.is_self:
-        parseText(submission.selftext)
+        parseText(submission.selftext, count_word_freqs, max_threshold)
 
 
-def processSubreddit(subreddit):
-    """Parse comments, title text, and selftext in a given subreddit."""
+def processSubreddit(subreddit, period, limit, count_word_freqs, max_threshold):
+    """
+        Parse comments, title text, and selftext in a given subreddit.
+        
+        :arg `period`: the time period to scrape the subreddit over (day, week, month, etc.)
+        
+        :arg `limit`: the maximum number of submissions to scrape from the subreddit
+        
+        :arg `count_word_freqs`: only count a word once per text block (title,
+                                 selftext, comment body) rather than incrementing
+                                 the total for for each instance.
+        
+        :arg `max_threshold`: maximum relative frequency in the text a word can
+                              appear to be considered in word counts.
+                              prevents word spamming in a single submission.
+    """
 
     # determine period to count the words over
-    params = {'t': options.period}
-    for submission in with_status(subreddit.get_top(limit=options.limit,
-                                                    params=params)):
+    params = {'t': period}
+    for submission in with_status(subreddit.get_top(limit=limit, params=params)):
         try:
-            processSubmission(submission)
+            processSubmission(submission, count_word_freqs, max_threshold)
         except HTTPError as exc:
             sys.stderr.write("\nSkipping submission {0} due to HTTP status {1}"
                              " error. Continuing...\n"
@@ -173,7 +217,9 @@ def processSubreddit(subreddit):
 
 
 def with_status(iterable):
-    """Wrap an iterable outputing '.' for each item (up to 50 a line)."""
+    """
+        Wrap an iterable outputting '.' for each item (up to 100 per line).
+    """
     for i, item in enumerate(iterable):
         sys.stderr.write('.')
         sys.stderr.flush()
@@ -183,7 +229,9 @@ def with_status(iterable):
 
 
 def main():
-    user, target = parse_cmd_line()
+    
+    # parse the command-line options and arguments
+    user, target, options, args = parse_cmd_line()
 
     # open connection to Reddit
     r = praw.Reddit(user_agent="bot by /u/{0}".format(user))
@@ -196,12 +244,14 @@ def main():
     target = target[3:]
 
     if options.is_subreddit:
-        processSubreddit(r.get_subreddit(target))
+        processSubreddit(r.get_subreddit(target), options.period, options.limit,
+                         options.count_word_freqs, options.max_threshold)
     else:
-        processRedditor(r.get_redditor(target))
+        processRedditor(r.get_redditor(target), options.limit,
+                        options.count_word_freqs, options.max_threshold)
 
     # build a string containing all the words for the word cloud software
-    output = ""
+    output = "\n"
 
     # open output file to store the output string
     outFileName = target + ".csv"

From 8baa76f95a17bcb2c324cd11b07a81df9544f5c9 Mon Sep 17 00:00:00 2001
From: Randy Olson <rso@randalolson.com>
Date: Sun, 3 Mar 2013 17:55:40 -0500
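Subject: [PATCH 2/3] Move tests. Modify tests for new functions.

The test modules move from tests/ to the repository root. setUp no
longer calls parse_cmd_line(); test_parse_cmd_line unpacks the new
four-value return, and the parseText and processSubmission tests pass
count_word_freqs and max_threshold explicitly.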
---
 tests/test-functions.py => test-functions.py | 7 +++----
 tests/test.py => test.py                     | 0
 2 files changed, 3 insertions(+), 4 deletions(-)
 rename tests/test-functions.py => test-functions.py (91%)
 rename tests/test.py => test.py (100%)

diff --git a/tests/test-functions.py b/test-functions.py
similarity index 91%
rename from tests/test-functions.py
rename to test-functions.py
index 4257c56..7ee3e9f 100644
--- a/tests/test-functions.py
+++ b/test-functions.py
@@ -7,11 +7,10 @@
 class TestSequenceFunctions(unittest.TestCase):
 
     def setUp(self):
-        self.user, self.target = wf.parse_cmd_line()
         wf.popularWords = defaultdict(int)
 
     def test_parse_cmd_line(self):
-        self.user, self.target = wf.parse_cmd_line()
+        self.user, self.target, options, args = wf.parse_cmd_line()
         self.assertEqual(self.user, sys.argv[1])
         self.assertEqual(self.target, sys.argv[2])
 
@@ -27,7 +26,7 @@ def test_parseText(self):
         for word, freq in popularWords.items():
             txt += str((word + " ") * freq)
 
-        wf.parseText(txt)
+        wf.parseText(txt, count_word_freqs=True, max_threshold=0.34)
         self.assertEqual(popularWords, wf.popularWords)
 
         # TODO: still need to test:
@@ -52,7 +51,7 @@ def test_processSubmission(self):
         # parse a fixed thread
         # TODO: make our own test thread
         sub = r.get_submission(url="http://www.reddit.com/r/pics/comments/92dd8/test_post_please_ignore/")
-        wf.processSubmission(sub)
+        wf.processSubmission(sub, count_word_freqs=True, max_threshold=0.34)
         
         # only look at the top 10 most-used words in the thread
         # TODO: look at all words used in thread
diff --git a/tests/test.py b/test.py
similarity index 100%
rename from tests/test.py
rename to test.py

From 0124f931ed2710f4ce41f2d587e1c581729e908b Mon Sep 17 00:00:00 2001
From: Randy Olson <rso@randalolson.com>
Date: Sun, 3 Mar 2013 17:55:57 -0500
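Subject: [PATCH 3/3] Modify all function calls to explicitly state each
 variable.

Calls to parseText, processSubmission, processSubreddit and
processRedditor now pass their arguments by keyword, as does the
with_status call in processSubreddit. The `count_word_freqs` docstrings
are also corrected: a word is counted only once per text block when the
flag is False, not when it is True.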
---
 word_freqs.py | 38 +++++++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/word_freqs.py b/word_freqs.py
index 42760b0..cdd15af 100755
--- a/word_freqs.py
+++ b/word_freqs.py
@@ -112,7 +112,7 @@ def parseText(text, count_word_freqs, max_threshold):
     """
         Parse the passed in text and add words that are not common.
         
-        :arg `count_word_freqs`: only count a word once per text block (title,
+        :arg `count_word_freqs`: if False, only count a word once per text block (title,
                                  selftext, comment body) rather than incrementing
                                  the total for for each instance.
         
@@ -143,7 +143,7 @@ def processRedditor(redditor, limit, count_word_freqs, max_threshold):
         
         :arg `limit`: the maximum number of submissions to scrape from the subreddit
         
-        :arg `count_word_freqs`: only count a word once per text block (title,
+        :arg `count_word_freqs`: if False, only count a word once per text block (title,
                                  selftext, comment body) rather than incrementing
                                  the total for for each instance.
         
@@ -154,16 +154,18 @@ def processRedditor(redditor, limit, count_word_freqs, max_threshold):
     """
     for entry in with_status(redditor.get_overview(limit=limit)):
         if isinstance(entry, praw.objects.Comment):  # Parse comment
-            parseText(entry.body, count_word_freqs, max_threshold)
+            parseText(text=entry.body, count_word_freqs=count_word_freqs,
+                      max_threshold=max_threshold)
         else:  # Parse submission
-            processSubmission(entry, count_word_freqs, max_threshold, include_comments=False)
+            processSubmission(submission=entry, count_word_freqs=count_word_freqs,
+                              max_threshold=max_threshold, include_comments=False)
 
 
 def processSubmission(submission, count_word_freqs, max_threshold, include_comments=True):
     """
         Parse a submission's text and body (if applicable).
 
-        :arg `count_word_freqs`: only count a word once per text block (title,
+        :arg `count_word_freqs`: if False, only count a word once per text block (title,
                                  selftext, comment body) rather than incrementing
                                  the total for for each instance.
         
@@ -177,14 +179,17 @@ def processSubmission(submission, count_word_freqs, max_threshold, include_comme
     if include_comments:  # parse all the comments for the submission
         submission.replace_more_comments()
         for comment in praw.helpers.flatten_tree(submission.comments):
-            parseText(comment.body, count_word_freqs, max_threshold)
+            parseText(text=comment.body, count_word_freqs=count_word_freqs,
+                      max_threshold=max_threshold)
 
     # parse the title of the submission
-    parseText(submission.title, count_word_freqs, max_threshold)
+    parseText(text=submission.title, count_word_freqs=count_word_freqs,
+              max_threshold=max_threshold)
 
     # parse the selftext of the submission (if applicable)
     if submission.is_self:
-        parseText(submission.selftext, count_word_freqs, max_threshold)
+        parseText(text=submission.selftext, count_word_freqs=count_word_freqs,
+                  max_threshold=max_threshold)
 
 
 def processSubreddit(subreddit, period, limit, count_word_freqs, max_threshold):
@@ -195,7 +200,7 @@ def processSubreddit(subreddit, period, limit, count_word_freqs, max_threshold):
         
         :arg `limit`: the maximum number of submissions to scrape from the subreddit
         
-        :arg `count_word_freqs`: only count a word once per text block (title,
+        :arg `count_word_freqs`: if False, only count a word once per text block (title,
                                  selftext, comment body) rather than incrementing
                                  the total for for each instance.
         
@@ -206,9 +211,10 @@ def processSubreddit(subreddit, period, limit, count_word_freqs, max_threshold):
 
     # determine period to count the words over
     params = {'t': period}
-    for submission in with_status(subreddit.get_top(limit=limit, params=params)):
+    for submission in with_status(iterable=subreddit.get_top(limit=limit, params=params)):
         try:
-            processSubmission(submission, count_word_freqs, max_threshold)
+            processSubmission(submission=submission, count_word_freqs=count_word_freqs,
+                              max_threshold=max_threshold)
         except HTTPError as exc:
             sys.stderr.write("\nSkipping submission {0} due to HTTP status {1}"
                              " error. Continuing...\n"
@@ -244,11 +250,13 @@ def main():
     target = target[3:]
 
     if options.is_subreddit:
-        processSubreddit(r.get_subreddit(target), options.period, options.limit,
-                         options.count_word_freqs, options.max_threshold)
+        processSubreddit(subreddit=r.get_subreddit(target), period=options.period,
+                         limit=options.limit, count_word_freqs=options.count_word_freqs,
+                         max_threshold=options.max_threshold)
     else:
-        processRedditor(r.get_redditor(target), options.limit,
-                        options.count_word_freqs, options.max_threshold)
+        processRedditor(redditor=r.get_redditor(target), limit=options.limit,
+                        count_word_freqs=options.count_word_freqs,
+                        max_threshold=options.max_threshold)
 
     # build a string containing all the words for the word cloud software
     output = "\n"