Refactor global options to local options.
Global options moved to main. All functions now take whatever options
they need as parameters.
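
In sketch form, the commit applies the standard global-to-parameter refactor. The names below are taken from the diff that follows; function bodies are elided, so this is only an illustration of the pattern:

# before: functions read module-level state set by parse_cmd_line()
options = None

def parseText(text):
    ...
    if count / total <= options.max_threshold:
        ...

# after: main() owns the parsed options and passes only what each function needs
def parseText(text, count_word_freqs, max_threshold):
    ...
    if count / total <= max_threshold:
        ...

def main():
    user, target, options, args = parse_cmd_line()
    ...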
rhiever committed Mar 3, 2013
1 parent 0677976 commit aa0b936
Showing 1 changed file with 83 additions and 33 deletions.
word_freqs.py: 116 changes (83 additions, 33 deletions)
@@ -44,12 +44,12 @@
# put words here that you don't want to include in the word cloud
excludedWords = ["/", "--", "...", "deleted", ")x"]

# Global Variable Initialization
options = None


def parse_cmd_line():
# command-line argument parsing
"""

@bboe (Contributor), Mar 3, 2013:

According to PEP257, this should just be something like:

"""Parse command line arguments."""

http://www.python.org/dev/peps/pep-0257/

@rhiever (Author, Owner), Mar 3, 2013:

Didn't know there were docstring standards... but of course there are! :-P

command-line argument parsing
"""

usage = ("usage: %prog [options] USERNAME TARGET\n\n"
"USERNAME sets your Reddit username for the bot\n"
"TARGET sets the subreddit or user to count word frequencies for."
@@ -89,7 +89,6 @@ def parse_cmd_line():
"selftext, comment body) rather than incrementing"
"the total for for each instance."))

global options
options, args = parser.parse_args()

if len(args) != 2:
@@ -106,11 +105,21 @@
if options.period not in ["day", "week", "month", "year", "all"]:
parser.error("Invalid period.")

return user, target
return user, target, options, args

@bboe (Contributor), Mar 3, 2013:

args is isn't needed as the necessary data is in user and target.

@rhiever (Author, Owner), Mar 3, 2013:

You mean args /isn't/ needed, right?

@bboe (Contributor), Mar 3, 2013:

Yes :)
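
A minimal sketch of that cleanup (hypothetical; not part of this commit):

def parse_cmd_line():
    ...
    # user and target already carry everything pulled out of args
    return user, target, options

def main():
    user, target, options = parse_cmd_line()
    ...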



def parseText(text):
"""Parse the passed in text and add words that are not common."""
def parseText(text, count_word_freqs, max_threshold):
"""
Parse the passed in text and add words that are not common.

@bboe (Contributor), Mar 3, 2013:

The one-line part should be on the same line as """ (again PEP 257).

:arg `count_word_freqs`: only count a word once per text block (title,

@bboe (Contributor), Mar 3, 2013:

:arg should be :param; this is for Sphinx documentation. Also there is no need for the ` around the param name in that case.

@rhiever (Author, Owner), Mar 3, 2013:

Oopsie.

selftext, comment body) rather than incrementing
the total for each instance.
:arg `max_threshold`: maximum relative frequency in the text a word can
appear to be considered in word counts.
Prevents word spamming in a single submission.
"""

@bboe (Contributor), Mar 3, 2013:

There should be a blank line before the """. There is a tool, pep257, that you can obtain which will verify PEP 257 conventions.
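
Taken together, these suggestions would give parseText a docstring along these lines (a sketch, not the committed code):

def parseText(text, count_word_freqs, max_threshold):
    """Parse the passed in text and add words that are not common.

    :param count_word_freqs: only count a word once per text block (title,
        selftext, comment body) rather than incrementing the total for each
        instance.
    :param max_threshold: maximum relative frequency in the text a word can
        appear to be considered in word counts. Prevents word spamming in a
        single submission.

    """

The pep257 checker bboe mentions (invoked as, e.g., pep257 word_freqs.py) can then be run over the file to confirm the conventions are followed.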

total = 0.0 # intentionally a float
text_words = defaultdict(int)
for word in text.split(): # Split on all whitespace
@@ -121,50 +130,85 @@ def parseText(text):

# Add to popularWords list
for word, count in text_words.items():
if count / total <= options.max_threshold:
if options.count_word_freqs:
if count / total <= max_threshold:
if count_word_freqs:
popularWords[word] += count
else:
popularWords[word] += 1


def processRedditor(redditor):
"""Parse submissions and comments for the given Redditor."""
for entry in with_status(redditor.get_overview(limit=options.limit)):
def processRedditor(redditor, limit, count_word_freqs, max_threshold):
"""
Parse submissions and comments for the given Redditor.
:arg `limit`: the maximum number of submissions to scrape from the subreddit
:arg `count_word_freqs`: only count a word once per text block (title,
selftext, comment body) rather than incrementing
the total for each instance.
:arg `max_threshold`: maximum relative frequency in the text a word can
appear to be considered in word counts.
Prevents word spamming in a single submission.
"""
for entry in with_status(redditor.get_overview(limit=limit)):
if isinstance(entry, praw.objects.Comment): # Parse comment
parseText(entry.body)
parseText(entry.body, count_word_freqs, max_threshold)
else: # Parse submission
processSubmission(entry, include_comments=False)
processSubmission(entry, count_word_freqs, max_threshold, include_comments=False)


def processSubmission(submission, include_comments=True):
"""Parse a submission's text and body (if applicable).
Include the submission's comments when `include_comments` is True.
def processSubmission(submission, count_word_freqs, max_threshold, include_comments=True):
"""
Parse a submission's text and body (if applicable).
:arg `count_word_freqs`: only count a word once per text block (title,
selftext, comment body) rather than incrementing
the total for each instance.
:arg `max_threshold`: maximum relative frequency in the text a word can
appear to be considered in word counts.
Prevents word spamming in a single submission.
:arg `include_comments`: include the submission's comments when True
"""
if include_comments: # parse all the comments for the submission
submission.replace_more_comments()
for comment in praw.helpers.flatten_tree(submission.comments):
parseText(comment.body)
parseText(comment.body, count_word_freqs, max_threshold)

# parse the title of the submission
parseText(submission.title)
parseText(submission.title, count_word_freqs, max_threshold)

# parse the selftext of the submission (if applicable)
if submission.is_self:
parseText(submission.selftext)
parseText(submission.selftext, count_word_freqs, max_threshold)


def processSubreddit(subreddit):
"""Parse comments, title text, and selftext in a given subreddit."""
def processSubreddit(subreddit, period, limit, count_word_freqs, max_threshold):
"""
Parse comments, title text, and selftext in a given subreddit.
:arg `period`: the time period to scrape the subreddit over (day, week, month, etc.)
:arg `limit`: the maximum number of submissions to scrape from the subreddit
:arg `count_word_freqs`: only count a word once per text block (title,
selftext, comment body) rather than incrementing
the total for each instance.
:arg `max_threshold`: maximum relative frequency in the text a word can
appear to be considered in word counts.
Prevents word spamming in a single submission.
"""

# determine period to count the words over
params = {'t': options.period}
for submission in with_status(subreddit.get_top(limit=options.limit,
params=params)):
params = {'t': period}
for submission in with_status(subreddit.get_top(limit=limit, params=params)):
try:
processSubmission(submission)
processSubmission(submission, count_word_freqs, max_threshold)
except HTTPError as exc:
sys.stderr.write("\nSkipping submission {0} due to HTTP status {1}"
" error. Continuing...\n"
@@ -173,7 +217,9 @@ def processSubreddit(subreddit):


def with_status(iterable):
"""Wrap an iterable outputing '.' for each item (up to 50 a line)."""
"""
Wrap an iterable outputting '.' for each item (up to 100 per line).
"""
for i, item in enumerate(iterable):
sys.stderr.write('.')
sys.stderr.flush()
@@ -183,7 +229,9 @@ def with_status(iterable):


def main():
user, target = parse_cmd_line()

# parse the command-line options and arguments
user, target, options, args = parse_cmd_line()

# open connection to Reddit
r = praw.Reddit(user_agent="bot by /u/{0}".format(user))
@@ -196,12 +244,14 @@ def main():
target = target[3:]

if options.is_subreddit:
processSubreddit(r.get_subreddit(target))
processSubreddit(r.get_subreddit(target), options.period, options.limit,
options.count_word_freqs, options.max_threshold)
else:
processRedditor(r.get_redditor(target))
processRedditor(r.get_redditor(target), options.limit,
options.count_word_freqs, options.max_threshold)

# build a string containing all the words for the word cloud software
output = ""
output = "\n"

@bboe (Contributor), Mar 3, 2013:

Might actually want to add \n to the end of stderr instead as the stdout output will look somewhat weird with an empty space at the beginning. The newline would go well at the end of with_status.
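
A sketch of that suggestion, assuming with_status otherwise keeps the shape shown in the diff (its line-wrapping logic is elided there):

import sys  # already imported at the top of word_freqs.py

def with_status(iterable):
    """Wrap an iterable, outputting '.' for each item (up to 100 per line)."""
    for i, item in enumerate(iterable):
        sys.stderr.write('.')
        sys.stderr.flush()
        # ... existing line-wrapping logic elided in the diff ...
        yield item
    # finish the progress output with a trailing newline so that stdout
    # (the word-cloud string) starts on a clean line
    sys.stderr.write('\n')

main() could then keep output = "" with no leading newline.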


# open output file to store the output string
outFileName = target + ".csv"
