Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Global var refactor #25

Merged
merged 3 commits into from
Mar 3, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions tests/test-functions.py → test-functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,10 @@
class TestSequenceFunctions(unittest.TestCase):

def setUp(self):
self.user, self.target = wf.parse_cmd_line()
wf.popularWords = defaultdict(int)

def test_parse_cmd_line(self):
self.user, self.target = wf.parse_cmd_line()
self.user, self.target, options, args = wf.parse_cmd_line()
self.assertEqual(self.user, sys.argv[1])
self.assertEqual(self.target, sys.argv[2])

Expand All @@ -27,7 +26,7 @@ def test_parseText(self):
for word, freq in popularWords.items():
txt += str((word + " ") * freq)

wf.parseText(txt)
wf.parseText(txt, count_word_freqs=True, max_threshold=0.34)
self.assertEqual(popularWords, wf.popularWords)

# TODO: still need to test:
Expand All @@ -52,7 +51,7 @@ def test_processSubmission(self):
# parse a fixed thread
# TODO: make our own test thread
sub = r.get_submission(url="http://www.reddit.com/r/pics/comments/92dd8/test_post_please_ignore/")
wf.processSubmission(sub)
wf.processSubmission(sub, count_word_freqs=True, max_threshold=0.34)

# only look at the top 10 most-used words in the thread
# TODO: look at all words used in thread
Expand Down
File renamed without changes.
124 changes: 91 additions & 33 deletions word_freqs.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,12 @@
# put words here that you don't want to include in the word cloud
excludedWords = ["/", "--", "...", "deleted", ")x"]

# Global Variable Initialization
options = None


def parse_cmd_line():
# command-line argument parsing
"""
command-line argument parsing
"""

usage = ("usage: %prog [options] USERNAME TARGET\n\n"
"USERNAME sets your Reddit username for the bot\n"
"TARGET sets the subreddit or user to count word frequencies for."
Expand Down Expand Up @@ -89,7 +89,6 @@ def parse_cmd_line():
"selftext, comment body) rather than incrementing"
"the total for for each instance."))

global options
options, args = parser.parse_args()

if len(args) != 2:
Expand All @@ -106,11 +105,21 @@ def parse_cmd_line():
if options.period not in ["day", "week", "month", "year", "all"]:
parser.error("Invalid period.")

return user, target
return user, target, options, args


def parseText(text):
"""Parse the passed in text and add words that are not common."""
def parseText(text, count_word_freqs, max_threshold):
"""
Parse the passed in text and add words that are not common.

:arg `count_word_freqs`: if False, only count a word once per text block (title,
selftext, comment body) rather than incrementing
the total for for each instance.

:arg `max_threshold`: maximum relative frequency in the text a word can
appear to be considered in word counts.
prevents word spamming in a single submission.
"""
total = 0.0 # intentionally a float
text_words = defaultdict(int)
for word in text.split(): # Split on all whitespace
Expand All @@ -121,50 +130,91 @@ def parseText(text):

# Add to popularWords list
for word, count in text_words.items():
if count / total <= options.max_threshold:
if options.count_word_freqs:
if count / total <= max_threshold:
if count_word_freqs:
popularWords[word] += count
else:
popularWords[word] += 1


def processRedditor(redditor):
"""Parse submissions and comments for the given Redditor."""
for entry in with_status(redditor.get_overview(limit=options.limit)):
def processRedditor(redditor, limit, count_word_freqs, max_threshold):
"""
Parse submissions and comments for the given Redditor.

:arg `limit`: the maximum number of submissions to scrape from the subreddit

:arg `count_word_freqs`: if False, only count a word once per text block (title,
selftext, comment body) rather than incrementing
the total for for each instance.

:arg `max_threshold`: maximum relative frequency in the text a word can
appear to be considered in word counts.
prevents word spamming in a single submission.

"""
for entry in with_status(redditor.get_overview(limit=limit)):
if isinstance(entry, praw.objects.Comment): # Parse comment
parseText(entry.body)
parseText(text=entry.body, count_word_freqs=count_word_freqs,
max_threshold=max_threshold)
else: # Parse submission
processSubmission(entry, include_comments=False)
processSubmission(submission=entry, count_word_freqs=count_word_freqs,
max_threshold=max_threshold, include_comments=False)


def processSubmission(submission, include_comments=True):
"""Parse a submission's text and body (if applicable).

Include the submission's comments when `include_comments` is True.
def processSubmission(submission, count_word_freqs, max_threshold, include_comments=True):
"""
Parse a submission's text and body (if applicable).

:arg `count_word_freqs`: if False, only count a word once per text block (title,
selftext, comment body) rather than incrementing
the total for for each instance.

:arg `max_threshold`: maximum relative frequency in the text a word can
appear to be considered in word counts.
prevents word spamming in a single submission.

:arg `include_comments`: include the submission's comments when True

"""
if include_comments: # parse all the comments for the submission
submission.replace_more_comments()
for comment in praw.helpers.flatten_tree(submission.comments):
parseText(comment.body)
parseText(text=comment.body, count_word_freqs=count_word_freqs,
max_threshold=max_threshold)

# parse the title of the submission
parseText(submission.title)
parseText(text=submission.title, count_word_freqs=count_word_freqs,
max_threshold=max_threshold)

# parse the selftext of the submission (if applicable)
if submission.is_self:
parseText(submission.selftext)
parseText(text=submission.selftext, count_word_freqs=count_word_freqs,
max_threshold=max_threshold)


def processSubreddit(subreddit):
"""Parse comments, title text, and selftext in a given subreddit."""
def processSubreddit(subreddit, period, limit, count_word_freqs, max_threshold):
"""
Parse comments, title text, and selftext in a given subreddit.

:arg `period`: the time period to scrape the subreddit over (day, week, month, etc.)

:arg `limit`: the maximum number of submissions to scrape from the subreddit

:arg `count_word_freqs`: if False, only count a word once per text block (title,
selftext, comment body) rather than incrementing
the total for for each instance.

:arg `max_threshold`: maximum relative frequency in the text a word can
appear to be considered in word counts.
prevents word spamming in a single submission.
"""

# determine period to count the words over
params = {'t': options.period}
for submission in with_status(subreddit.get_top(limit=options.limit,
params=params)):
params = {'t': period}
for submission in with_status(iterable=subreddit.get_top(limit=limit, params=params)):
try:
processSubmission(submission)
processSubmission(submission=submission, count_word_freqs=count_word_freqs,
max_threshold=max_threshold)
except HTTPError as exc:
sys.stderr.write("\nSkipping submission {0} due to HTTP status {1}"
" error. Continuing...\n"
Expand All @@ -173,7 +223,9 @@ def processSubreddit(subreddit):


def with_status(iterable):
"""Wrap an iterable outputing '.' for each item (up to 50 a line)."""
"""
Wrap an iterable outputting '.' for each item (up to 100 per line).
"""
for i, item in enumerate(iterable):
sys.stderr.write('.')
sys.stderr.flush()
Expand All @@ -183,7 +235,9 @@ def with_status(iterable):


def main():
user, target = parse_cmd_line()

# parse the command-line options and arguments
user, target, options, args = parse_cmd_line()

# open connection to Reddit
r = praw.Reddit(user_agent="bot by /u/{0}".format(user))
Expand All @@ -196,12 +250,16 @@ def main():
target = target[3:]

if options.is_subreddit:
processSubreddit(r.get_subreddit(target))
processSubreddit(subreddit=r.get_subreddit(target), period=options.period,
limit=options.limit, count_word_freqs=options.count_word_freqs,
max_threshold=options.max_threshold)
else:
processRedditor(r.get_redditor(target))
processRedditor(redditor=r.get_redditor(target), limit=options.limit,
count_word_freqs=options.count_word_freqs,
max_threshold=options.max_threshold)

# build a string containing all the words for the word cloud software
output = ""
output = "\n"

# open output file to store the output string
outFileName = target + ".csv"
Expand Down