From 959f686f71aa5e5da9a1a468b00598b71fffb52f Mon Sep 17 00:00:00 2001
From: ayuhsya
Date: Tue, 6 Dec 2016 22:56:09 +0530
Subject: [PATCH] Refactored the code with suggested changes

---
 sample/core.py    | 108 ++++++++++++++++++++++++++++++++++++++++++++++
 wiki_cat.py       |  45 -------------------
 wiki_images.py    |  44 -------------------
 wiki_linkshere.py |  38 ----------------
 4 files changed, 108 insertions(+), 127 deletions(-)
 create mode 100644 sample/core.py
 delete mode 100644 wiki_cat.py
 delete mode 100644 wiki_images.py
 delete mode 100644 wiki_linkshere.py

diff --git a/sample/core.py b/sample/core.py
new file mode 100644
index 0000000..072d254
--- /dev/null
+++ b/sample/core.py
@@ -0,0 +1,108 @@
+"""Thin wrapper around the MediaWiki query API: categories, images, linkshere."""
+import requests
+from pprint import pprint
+import re
+
+
+class WikiPage:
+    def __init__(self, *pageid):
+        self.base_url = "https://en.wikipedia.org/w/api.php"
+
+        self.payload = {
+            'action': 'query',
+            'pageids': '|'.join(str(ID) for ID in pageid),
+            'format': 'json',
+        }
+
+    # Fetch the categories each page belongs to
+    def get_categories(self, get_hidden=False):
+        if not self.payload['pageids']:
+            return {}
+
+        prop = 'categories'
+        self.payload['clshow'] = 'hidden' if get_hidden else '!hidden'
+        self.payload['prop'] = prop
+
+        res = requests.get(self.base_url, params=self.payload).json()
+        cat_list = _strip_JSON(res, prop, 'Category')
+
+        while 'continue' in res:
+            self.payload['clcontinue'] = res['continue']['clcontinue']
+            res = requests.get(self.base_url, params=self.payload).json()
+            _append_results(cat_list, res, prop, 'Category')
+
+        self.payload['clcontinue'] = None
+        self.payload['prop'] = None
+        return cat_list
+
+
+    # Fetch the images used on each page
+    def get_images(self, imlimit='max', imdir='ascending'):
+        if not self.payload['pageids']:
+            return {}
+
+        prop = 'images'
+        self.payload['prop'] = prop
+        self.payload['imlimit'] = imlimit
+        self.payload['imdir'] = imdir
+
+        res = requests.get(self.base_url, params=self.payload).json()
+
+        img_list = _strip_JSON(res, prop, 'File')
+        while 'continue' in res:
+            self.payload['imcontinue'] = res['continue']['imcontinue']
+            res = requests.get(self.base_url, params=self.payload).json()
+            _append_results(img_list, res, prop, 'File')
+
+        self.payload['imcontinue'] = None
+        self.payload['prop'] = None
+        return img_list
+
+    # Fetch the pages that link to each page
+    def get_linkshere(self, lhprop='pageid|title|redirect', lhlimit='max'):
+        if not self.payload['pageids']:
+            return {}
+
+        prop = 'linkshere'
+        self.payload['prop'] = prop
+        self.payload['lhprop'] = lhprop
+        self.payload['lhlimit'] = lhlimit
+
+        res = requests.get(self.base_url, params=self.payload).json()
+
+        lh_list = _strip_JSON(res, prop, '_nothing-to-strip_')
+        while 'continue' in res:
+            self.payload['lhcontinue'] = res['continue']['lhcontinue']
+            res = requests.get(self.base_url, params=self.payload).json()
+            _append_results(lh_list, res, prop, '_nothing-to-strip_')
+
+        self.payload['lhcontinue'] = None
+        self.payload['prop'] = None
+        return lh_list
+
+
+def _strip_prop(text, prop):
+    return re.sub(prop + ':', "", text, count=1)
+
+def _strip_JSON(res, prop, strip_chars):
+    ret = {}
+    for page_id, page_content in res['query']['pages'].items():
+        if prop not in page_content:
+            continue
+        ret[page_id] = [_strip_prop(entry['title'], strip_chars) for entry in page_content[prop]]
+    return ret
+
+def _append_results(currlist, newlist, prop, strip_chars):
+    # Merge a freshly fetched batch into the accumulated per-page results
+    ret = _strip_JSON(newlist, prop, strip_chars)
+    for key in ret:
+        if key not in currlist:
+            currlist[key] = []
+        currlist[key] += ret[key]
+
+if __name__ == "__main__":
+
+    wk = WikiPage('843158', '20715044')
+    pprint(wk.get_categories())
+    pprint(wk.get_images())
+    pprint(wk.get_linkshere())
diff --git a/wiki_cat.py b/wiki_cat.py
deleted file mode 100644
index 0588ac0..0000000
--- a/wiki_cat.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import requests
-import re
-
-
-class WikiCatQuery(object):
-    def __init__(self):
-        self.base_url = "https://en.wikipedia.org/w/api.php"
-
-        self.payload = {
-            'action': 'query',
-            'prop': 'categories',
-            'pageids': None,
-            'format': 'json',
-            'clshow': '!hidden'
-        }
-
-    @staticmethod
-    def __strip_cat(text):
-        return re.sub("Category:", "", text, count=1)
-
-    def __res_cat_list(self, res):
-        return res['query']['pages'][str(self.payload['pageids'])]['categories']
-
-    def get_cat(self, pageid=None):
-        if pageid is None:
-            return []
-
-        self.payload['pageids'] = pageid
-        res = requests.get(self.base_url, params=self.payload).json()
-
-        cat_list = [self.__strip_cat(item['title']) for item in self.__res_cat_list(res)]
-        while 'continue' in res:
-            self.payload['clcontinue'] = res['continue']['clcontinue']
-            res = requests.get(self.base_url, params=self.payload).json()
-            cat_list += [self.__strip_cat(item['title']) for item in self.__res_cat_list(res)]
-
-        self.payload['pageids'] = None
-        return cat_list
-
-if __name__ == "__main__":
-    from pprint import pprint
-
-    wk = WikiCatQuery()
-    pprint(wk.get_cat(843158))
-    pprint(wk.get_cat(20715044))
diff --git a/wiki_images.py b/wiki_images.py
deleted file mode 100644
index 5ff9556..0000000
--- a/wiki_images.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# Get the list of images for a list of pages
-import re
-from pprint import pprint
-
-import requests
-
-base_url = "https://en.wikipedia.org/w/api.php"
-payload = {
-    'action': 'query',
-    'prop': 'images',
-    'pageids': None,
-    'format': 'json',
-    'imlimit': 'max',
-    'imcontinue': None,
-    'imdir': 'ascending'
-}
-
-
-def __strip_file(text):
-    return re.sub("File:", "", text, count=1)
-
-
-def get_images(*pageids):
-    image_names = {}
-    payload['pageids'] = '|'.join(str(x) for x in list(pageids))
-    res = requests.get(base_url, params=payload).json()
-    for pageid, page_content in res['query']['pages'].items():
-        if 'images' not in page_content:
-            continue
-        image_names[pageid] = [__strip_file(image['title']) for image in page_content['images']]
-    while 'continue' in res:
-        payload['imcontinue'] = res['continue']['imcontinue']
-        res = requests.get(base_url, params=payload).json()
-        for pageid, page_content in res['query']['pages'].items():
-            if 'images' not in page_content:
-                continue
-            image_names[pageid] = image_names.get(pageid, []) + [__strip_file(image['title']) for image in
-                                                                 page_content['images']]
-    payload['imcontinue'] = None
-    payload['pageids'] = None
-    return image_names
-
-
-pprint(get_images(843158, 20715044))
diff --git a/wiki_linkshere.py b/wiki_linkshere.py
deleted file mode 100644
index 160bfcb..0000000
--- a/wiki_linkshere.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Find all pages that link to the given pages.
-from pprint import pprint - -import requests - -base_url = "https://en.wikipedia.org/w/api.php" -payload = { - 'action': 'query', - 'prop': 'linkshere', - 'pageids': None, - 'format': 'json', - 'lhprop': 'pageid|title|redirect', - 'lhlimit': 'max', - 'lhcontinue': None -} - -# TODO: Return title and page ids both -def get_linkshere(*pageids): - linkshere = {} - payload['pageids'] = '|'.join(str(x) for x in list(pageids)) - res = requests.get(base_url, params=payload).json() - for pageid, page_content in res['query']['pages'].items(): - if 'linkshere' not in page_content: - continue - linkshere[pageid] = linkshere.get(pageid, []) + [link['pageid'] for link in page_content['linkshere']] - while 'continue' in res: - payload['lhcontinue'] = res['continue']['lhcontinue'] - res = requests.get(base_url, params=payload).json() - for pageid, page_content in res['query']['pages'].items(): - if 'linkshere' not in page_content: - continue - linkshere[pageid] = linkshere.get(pageid, []) + [link['pageid'] for link in page_content['linkshere']] - payload['lhcontinue'] = None - payload['pageids'] = None - return linkshere - - -pprint(get_linkshere(843158, 20715044))
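
For reference, a minimal usage sketch of the new WikiPage class (the import path
sample.core and the argument values shown are assumptions based on this patch, not
a tested interface):

    # usage_sketch.py -- exercises the WikiPage wrapper added in sample/core.py
    from pprint import pprint

    from sample.core import WikiPage

    # The two page IDs below are the ones the patch's own __main__ block uses.
    wk = WikiPage(843158, 20715044)

    pprint(wk.get_categories())                 # visible categories, keyed by page ID
    pprint(wk.get_categories(get_hidden=True))  # hidden (maintenance) categories instead
    pprint(wk.get_images(imdir='descending'))   # image file names, reverse-sorted
    pprint(wk.get_linkshere())                  # titles of pages linking to each page

Each call follows the API's 'continue' tokens internally, so the returned dict
holds the full result set rather than just the first batch.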