From 959f686f71aa5e5da9a1a468b00598b71fffb52f Mon Sep 17 00:00:00 2001
From: ayuhsya
Date: Tue, 6 Dec 2016 22:56:09 +0530
Subject: [PATCH] Refactored the code with suggested changes

---
 sample/core.py    | 108 ++++++++++++++++++++++++++++++++++++++++++++++
 wiki_cat.py       |  45 -------------------
 wiki_images.py    |  44 -------------------
 wiki_linkshere.py |  38 ----------------
 4 files changed, 108 insertions(+), 127 deletions(-)
 create mode 100644 sample/core.py
 delete mode 100644 wiki_cat.py
 delete mode 100644 wiki_images.py
 delete mode 100644 wiki_linkshere.py

diff --git a/sample/core.py b/sample/core.py
new file mode 100644
index 0000000..072d254
--- /dev/null
+++ b/sample/core.py
@@ -0,0 +1,108 @@
+"""Thin wrapper around the MediaWiki query API: categories, images, linkshere."""
+import requests
+from pprint import pprint
+import re
+
+
+class WikiPage:
+    def __init__(self, *pageid):
+        self.base_url = "https://en.wikipedia.org/w/api.php"
+
+        self.payload = {
+            'action': 'query',
+            'pageids': '|'.join(str(ID) for ID in pageid),
+            'format': 'json',
+        }
+
+    # Fetch the categories each page belongs to
+    def get_categories(self, get_hidden=False):
+        if not self.payload['pageids']:
+            return {}
+
+        prop = 'categories'
+        self.payload['clshow'] = 'hidden' if get_hidden else '!hidden'
+        self.payload['prop'] = prop
+
+        res = requests.get(self.base_url, params=self.payload).json()
+        cat_list = _strip_JSON(res, prop, 'Category')
+
+        while 'continue' in res:
+            self.payload['clcontinue'] = res['continue']['clcontinue']
+            res = requests.get(self.base_url, params=self.payload).json()
+            _append_results(cat_list, res, prop, 'Category')
+
+        self.payload['clcontinue'] = None
+        self.payload['prop'] = None
+        return cat_list
+
+
+    # Fetch the images used on each page
+    def get_images(self, imlimit='max', imdir='ascending'):
+        if not self.payload['pageids']:
+            return {}
+
+        prop = 'images'
+        self.payload['prop'] = prop
+        self.payload['imlimit'] = imlimit
+        self.payload['imdir'] = imdir
+
+        res = requests.get(self.base_url, params=self.payload).json()
+
+        img_list = _strip_JSON(res, prop, 'File')
+        while 'continue' in res:
+            self.payload['imcontinue'] = res['continue']['imcontinue']
+            res = requests.get(self.base_url, params=self.payload).json()
+            _append_results(img_list, res, prop, 'File')
+
+        self.payload['imcontinue'] = None
+        self.payload['prop'] = None
+        return img_list
+
+    # Fetch the pages that link to each page
+    def get_linkshere(self, lhprop='pageid|title|redirect', lhlimit='max'):
+        if not self.payload['pageids']:
+            return {}
+
+        prop = 'linkshere'
+        self.payload['prop'] = prop
+        self.payload['lhprop'] = lhprop
+        self.payload['lhlimit'] = lhlimit
+
+        res = requests.get(self.base_url, params=self.payload).json()
+
+        lh_list = _strip_JSON(res, prop, '_nothing-to-strip_')
+        while 'continue' in res:
+            self.payload['lhcontinue'] = res['continue']['lhcontinue']
+            res = requests.get(self.base_url, params=self.payload).json()
+            _append_results(lh_list, res, prop, '_nothing-to-strip_')
+
+        self.payload['lhcontinue'] = None
+        self.payload['prop'] = None
+        return lh_list
+
+
+def _strip_prop(text, prop):
+    return re.sub(prop + ':', "", text, count=1)
+
+def _strip_JSON(res, prop, strip_chars):
+    ret = {}
+    for page_id, page_content in res['query']['pages'].items():
+        if prop not in page_content:
+            continue
+        ret[page_id] = [_strip_prop(entry['title'], strip_chars) for entry in page_content[prop]]
+    return ret
+
+def _append_results(currlist, newlist, prop, strip_chars):
+    # Merge a freshly fetched batch into the accumulated per-page results
+    ret = _strip_JSON(newlist, prop, strip_chars)
+    for key in ret:
+        if key not in currlist:
+            currlist[key] = []
+        currlist[key] += ret[key]
+
+if __name__ == "__main__":
+
+    wk = WikiPage('843158', '20715044')
+    pprint(wk.get_categories())
+    pprint(wk.get_images())
+    pprint(wk.get_linkshere())
diff --git a/wiki_cat.py b/wiki_cat.py
deleted file mode 100644
index 0588ac0..0000000
--- a/wiki_cat.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import requests
-import re
-
-
-class WikiCatQuery(object):
-    def __init__(self):
-        self.base_url = "https://en.wikipedia.org/w/api.php"
-
-        self.payload = {
-            'action': 'query',
-            'prop': 'categories',
-            'pageids': None,
-            'format': 'json',
-            'clshow': '!hidden'
-        }
-
-    @staticmethod
-    def __strip_cat(text):
-        return re.sub("Category:", "", text, count=1)
-
-    def __res_cat_list(self, res):
-        return res['query']['pages'][str(self.payload['pageids'])]['categories']
-
-    def get_cat(self, pageid=None):
-        if pageid is None:
-            return []
-
-        self.payload['pageids'] = pageid
-        res = requests.get(self.base_url, params=self.payload).json()
-
-        cat_list = [self.__strip_cat(item['title']) for item in self.__res_cat_list(res)]
-        while 'continue' in res:
-            self.payload['clcontinue'] = res['continue']['clcontinue']
-            res = requests.get(self.base_url, params=self.payload).json()
-            cat_list += [self.__strip_cat(item['title']) for item in self.__res_cat_list(res)]
-
-        self.payload['pageids'] = None
-        return cat_list
-
-if __name__ == "__main__":
-    from pprint import pprint
-
-    wk = WikiCatQuery()
-    pprint(wk.get_cat(843158))
-    pprint(wk.get_cat(20715044))
diff --git a/wiki_images.py b/wiki_images.py
deleted file mode 100644
index 5ff9556..0000000
--- a/wiki_images.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# Get the list of images for a list of pages
-import re
-from pprint import pprint
-
-import requests
-
-base_url = "https://en.wikipedia.org/w/api.php"
-payload = {
-    'action': 'query',
-    'prop': 'images',
-    'pageids': None,
-    'format': 'json',
-    'imlimit': 'max',
-    'imcontinue': None,
-    'imdir': 'ascending'
-}
-
-
-def __strip_file(text):
-    return re.sub("File:", "", text, count=1)
-
-
-def get_images(*pageids):
-    image_names = {}
-    payload['pageids'] = '|'.join(str(x) for x in list(pageids))
-    res = requests.get(base_url, params=payload).json()
-    for pageid, page_content in res['query']['pages'].items():
-        if 'images' not in page_content:
-            continue
-        image_names[pageid] = [__strip_file(image['title']) for image in page_content['images']]
-    while 'continue' in res:
-        payload['imcontinue'] = res['continue']['imcontinue']
-        res = requests.get(base_url, params=payload).json()
-        for pageid, page_content in res['query']['pages'].items():
-            if 'images' not in page_content:
-                continue
-            image_names[pageid] = image_names.get(pageid, []) + [__strip_file(image['title']) for image in
-                                                                 page_content['images']]
-    payload['imcontinue'] = None
-    payload['pageids'] = None
-    return image_names
-
-
-pprint(get_images(843158, 20715044))
diff --git a/wiki_linkshere.py b/wiki_linkshere.py
deleted file mode 100644
index 160bfcb..0000000
--- a/wiki_linkshere.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Find all pages that link to the given pages.
-from pprint import pprint - -import requests - -base_url = "https://en.wikipedia.org/w/api.php" -payload = { - 'action': 'query', - 'prop': 'linkshere', - 'pageids': None, - 'format': 'json', - 'lhprop': 'pageid|title|redirect', - 'lhlimit': 'max', - 'lhcontinue': None -} - -# TODO: Return title and page ids both -def get_linkshere(*pageids): - linkshere = {} - payload['pageids'] = '|'.join(str(x) for x in list(pageids)) - res = requests.get(base_url, params=payload).json() - for pageid, page_content in res['query']['pages'].items(): - if 'linkshere' not in page_content: - continue - linkshere[pageid] = linkshere.get(pageid, []) + [link['pageid'] for link in page_content['linkshere']] - while 'continue' in res: - payload['lhcontinue'] = res['continue']['lhcontinue'] - res = requests.get(base_url, params=payload).json() - for pageid, page_content in res['query']['pages'].items(): - if 'linkshere' not in page_content: - continue - linkshere[pageid] = linkshere.get(pageid, []) + [link['pageid'] for link in page_content['linkshere']] - payload['lhcontinue'] = None - payload['pageids'] = None - return linkshere - - -pprint(get_linkshere(843158, 20715044))
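
For reference, a minimal usage sketch of the new WikiPage class (the import path
sample.core and the argument values shown are assumptions based on this patch, not
a tested interface):

    # usage_sketch.py -- exercises the WikiPage wrapper added in sample/core.py
    from pprint import pprint

    from sample.core import WikiPage

    # The two page IDs below are the ones the patch's own __main__ block uses.
    wk = WikiPage(843158, 20715044)

    pprint(wk.get_categories())                 # visible categories, keyed by page ID
    pprint(wk.get_categories(get_hidden=True))  # hidden (maintenance) categories instead
    pprint(wk.get_images(imdir='descending'))   # image file names, reverse-sorted
    pprint(wk.get_linkshere())                  # titles of pages linking to each page

Each call follows the API's 'continue' tokens internally, so the returned dict
holds the full result set rather than just the first batch.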