From b679c68413c701da6cd8e615cd548f50fea68ce3 Mon Sep 17 00:00:00 2001 From: YiMing Date: Tue, 18 Aug 2020 18:18:43 +0800 Subject: [PATCH 1/6] [refactor] fix Profile object --- twitter_scraper/modules/profile.py | 62 ++++++++++++++++++++++-------- 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/twitter_scraper/modules/profile.py b/twitter_scraper/modules/profile.py index ffd3f2e..2680c8a 100644 --- a/twitter_scraper/modules/profile.py +++ b/twitter_scraper/modules/profile.py @@ -30,7 +30,8 @@ def __init__(self, username): headers = { "Accept": "application/json, text/javascript, */*; q=0.01", "Referer": f"https://twitter.com/{username}", - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8", + # Getting mobile webpage by using Chrome < 38 + "User-Agent": 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062 Safari/537.36', "X-Twitter-Active-User": "yes", "X-Requested-With": "XMLHttpRequest", "Accept-Language": "en-US", @@ -56,14 +57,20 @@ def __parse_profile(self, page): except: self.is_private = False + # blue badge + self.is_verified = True try: - self.is_verified = html.find(".ProfileHeaderCard-badges .Icon--verified")[0] - self.is_verified = True + q = html.find("a.badge")[0] + if not q: + self.is_verified = False except: self.is_verified = False - self.location = html.find(".ProfileHeaderCard-locationText")[0].text - if not self.location: + try: + self.location = html.find('div.location')[0].text + if not self.location: + self.location = None + except: self.location = None self.birthday = html.find(".ProfileHeaderCard-birthdateText")[0].text @@ -72,7 +79,10 @@ def __parse_profile(self, page): else: self.birthday = None - self.profile_photo = html.find(".ProfileAvatar-image")[0].attrs["src"] + try: + self.profile_photo = html.find("td.avatar img")[0].attrs["src"] + except: + self.profile_photo = None try: self.banner_photo = html.find(".ProfileCanopy-headerBg img")[0].attrs["src"] @@ -84,32 +94,50 @@ def __parse_profile(self, page): self.user_id = html.find(".ProfileNav")[0].attrs["data-user-id"] - self.biography = html.find(".ProfileHeaderCard-bio")[0].text - if not self.birthday: - self.birthday = None + try: + self.biography = html.find("div.bio div.dir-ltr")[0].text + if not self.biography: + self.biography = None + except: + self.biography = None - self.website = html.find(".ProfileHeaderCard-urlText")[0].text - if not self.website: + try: + self.website = html.find("div.url div.dir-ltr")[0].text + if not self.website: + self.website = None + except: self.website = None + # get stats table if available + stats_table = None + stats = None + try: + stats_table = html.find('table.profile-stats')[0] + stats = stats_table.find('td div.statnum') + if not stats: + self.tweets_count = None + self.following_count = None + self.followers_count = None + except: + self.tweets_count = None + self.following_count = None + self.followers_count = None + # get total tweets count if available try: - q = html.find('li[class*="--tweets"] span[data-count]')[0].attrs["data-count"] - self.tweets_count = int(q) + self.tweets_count = int(stats[0].text.replace(',','')) except: self.tweets_count = None # get total following count if available try: - q = html.find('li[class*="--following"] span[data-count]')[0].attrs["data-count"] - self.following_count = int(q) + self.following_count = int(stats[1].text.replace(',','')) except: self.following_count = None # get total follower count if available try: - q = html.find('li[class*="--followers"] span[data-count]')[0].attrs["data-count"] - self.followers_count = int(q) + self.followers_count = int(stats[2].text.replace(',','')) except: self.followers_count = None From 9fa74437598afe86c12ea0522897f4bbc1ee3072 Mon Sep 17 00:00:00 2001 From: YiMing Date: Wed, 19 Aug 2020 16:09:36 +0800 Subject: [PATCH 2/6] [comment] adding TODO 1. is_private 2. birthday 3. favorites --- twitter_scraper/modules/profile.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/twitter_scraper/modules/profile.py b/twitter_scraper/modules/profile.py index 2680c8a..001fbe7 100644 --- a/twitter_scraper/modules/profile.py +++ b/twitter_scraper/modules/profile.py @@ -50,7 +50,8 @@ def __parse_profile(self, page): ) except ParserError: pass - + + # TODO unfixed try: self.is_private = html.find(".ProfileHeaderCard-badges .Icon--protected")[0] self.is_private = True @@ -73,6 +74,7 @@ def __parse_profile(self, page): except: self.location = None + # TODO unfixed self.birthday = html.find(".ProfileHeaderCard-birthdateText")[0].text if self.birthday: self.birthday = self.birthday.replace("Born ", "") @@ -142,6 +144,7 @@ def __parse_profile(self, page): self.followers_count = None # get total like count if available + # TODO unfixed try: q = html.find('li[class*="--favorites"] span[data-count]')[0].attrs["data-count"] self.likes_count = int(q) From 5bb60d3fe560193f721ddd4fc7b71889faa7e90e Mon Sep 17 00:00:00 2001 From: yimingstar Date: Thu, 10 Sep 2020 11:21:46 +0800 Subject: [PATCH 3/6] [refactor] add exception for all column --- twitter_scraper/modules/profile.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/twitter_scraper/modules/profile.py b/twitter_scraper/modules/profile.py index 001fbe7..17a0144 100644 --- a/twitter_scraper/modules/profile.py +++ b/twitter_scraper/modules/profile.py @@ -74,11 +74,14 @@ def __parse_profile(self, page): except: self.location = None - # TODO unfixed - self.birthday = html.find(".ProfileHeaderCard-birthdateText")[0].text - if self.birthday: - self.birthday = self.birthday.replace("Born ", "") - else: + # TODO cannot find ProfileHeaderCard-birthdateText + try: + self.birthday = html.find(".ProfileHeaderCard-birthdateText")[0].text + if self.birthday: + self.birthday = self.birthday.replace("Born ", "") + else: + self.birthday = None + except: self.birthday = None try: @@ -88,13 +91,19 @@ def __parse_profile(self, page): try: self.banner_photo = html.find(".ProfileCanopy-headerBg img")[0].attrs["src"] - except KeyError: + except: self.banner_photo = None - page_title = html.find("title")[0].text - self.name = page_title[: page_title.find("(")].strip() - - self.user_id = html.find(".ProfileNav")[0].attrs["data-user-id"] + try: + page_title = html.find("title")[0].text + self.name = page_title[: page_title.find("(")].strip() + except: + self.name = None + + try: + self.user_id = html.find(".ProfileNav")[0].attrs["data-user-id"] + except: + self.user_id = None try: self.biography = html.find("div.bio div.dir-ltr")[0].text From cb8244ee6464fec985875cffe584f0e2ca0a2ebf Mon Sep 17 00:00:00 2001 From: yimingstar Date: Thu, 10 Sep 2020 11:29:22 +0800 Subject: [PATCH 4/6] adding TODO comments --- twitter_scraper/modules/profile.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/twitter_scraper/modules/profile.py b/twitter_scraper/modules/profile.py index 17a0144..a2e0835 100644 --- a/twitter_scraper/modules/profile.py +++ b/twitter_scraper/modules/profile.py @@ -51,7 +51,7 @@ def __parse_profile(self, page): except ParserError: pass - # TODO unfixed + # TODO cannot find ProfileHeaderCard-badges try: self.is_private = html.find(".ProfileHeaderCard-badges .Icon--protected")[0] self.is_private = True @@ -89,6 +89,7 @@ def __parse_profile(self, page): except: self.profile_photo = None + # TODO cannot find ProfileCanopy-headerBg try: self.banner_photo = html.find(".ProfileCanopy-headerBg img")[0].attrs["src"] except: From c7bbf55f5bfe94255242035f82237123f4581c38 Mon Sep 17 00:00:00 2001 From: YiMing Date: Tue, 29 Sep 2020 14:50:07 +0800 Subject: [PATCH 5/6] [refactor] adding exceptions for all attributes --- twitter_scraper/modules/profile.py | 47 ++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/twitter_scraper/modules/profile.py b/twitter_scraper/modules/profile.py index a2e0835..822de6a 100644 --- a/twitter_scraper/modules/profile.py +++ b/twitter_scraper/modules/profile.py @@ -40,7 +40,7 @@ def __init__(self, username): page = session.get(f"https://twitter.com/{username}", headers=headers) self.username = username self.__parse_profile(page) - + def __parse_profile(self, page): try: html = HTML(html=page.text, url="bunk", default_encoding="utf-8") @@ -55,7 +55,8 @@ def __parse_profile(self, page): try: self.is_private = html.find(".ProfileHeaderCard-badges .Icon--protected")[0] self.is_private = True - except: + except Exception as e: + self.__failed_fetching('is_private', e) self.is_private = False # blue badge @@ -64,14 +65,16 @@ def __parse_profile(self, page): q = html.find("a.badge")[0] if not q: self.is_verified = False - except: + except Exception as e: + self.__failed_fetching('is_verified', e) self.is_verified = False try: self.location = html.find('div.location')[0].text if not self.location: self.location = None - except: + except Exception as e: + self.__failed_fetching('location', e) self.location = None # TODO cannot find ProfileHeaderCard-birthdateText @@ -81,43 +84,50 @@ def __parse_profile(self, page): self.birthday = self.birthday.replace("Born ", "") else: self.birthday = None - except: + except Exception as e: + self.__failed_fetching('birthday', e) self.birthday = None try: self.profile_photo = html.find("td.avatar img")[0].attrs["src"] - except: + except Exception as e: + self.__failed_fetching('profile_photo', e) self.profile_photo = None # TODO cannot find ProfileCanopy-headerBg try: self.banner_photo = html.find(".ProfileCanopy-headerBg img")[0].attrs["src"] - except: + except Exception as e: + self.__failed_fetching('banner_photo', e) self.banner_photo = None try: page_title = html.find("title")[0].text self.name = page_title[: page_title.find("(")].strip() - except: + except Exception as e: + self.__failed_fetching('name', e) self.name = None try: self.user_id = html.find(".ProfileNav")[0].attrs["data-user-id"] - except: + except Exception as e: + self.__failed_fetching('user_id', e) self.user_id = None try: self.biography = html.find("div.bio div.dir-ltr")[0].text if not self.biography: self.biography = None - except: + except Exception as e: + self.__failed_fetching('biography', e) self.biography = None try: self.website = html.find("div.url div.dir-ltr")[0].text if not self.website: self.website = None - except: + except Exception as e: + self.__failed_fetching('website', e) self.website = None # get stats table if available @@ -138,19 +148,22 @@ def __parse_profile(self, page): # get total tweets count if available try: self.tweets_count = int(stats[0].text.replace(',','')) - except: + except Exception as e: + self.__failed_fetching('tweets_count', e) self.tweets_count = None # get total following count if available try: self.following_count = int(stats[1].text.replace(',','')) - except: + except Exception as e: + self.__failed_fetching('following_count', e) self.following_count = None # get total follower count if available try: self.followers_count = int(stats[2].text.replace(',','')) - except: + except Exception as e: + self.__failed_fetching('followers_count', e) self.followers_count = None # get total like count if available @@ -158,9 +171,13 @@ def __parse_profile(self, page): try: q = html.find('li[class*="--favorites"] span[data-count]')[0].attrs["data-count"] self.likes_count = int(q) - except: + except Exception as e: + self.__failed_fetching('likes_count', e) self.likes_count = None + def __failed_fetching(self, var: str, except_msg: str): + print(f'Unable to get {var} in html, exception - {except_msg}') + def to_dict(self): return dict( name=self.name, From f13ec7ff78cea5cf146a535a8d653d4d38f7d4fa Mon Sep 17 00:00:00 2001 From: YiMing Date: Tue, 29 Sep 2020 14:52:51 +0800 Subject: [PATCH 6/6] [refactor] rearrange logic of getting few attributes stats - tweets_count - following_count - followers_count --- twitter_scraper/modules/profile.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/twitter_scraper/modules/profile.py b/twitter_scraper/modules/profile.py index 822de6a..e41afaa 100644 --- a/twitter_scraper/modules/profile.py +++ b/twitter_scraper/modules/profile.py @@ -136,14 +136,8 @@ def __parse_profile(self, page): try: stats_table = html.find('table.profile-stats')[0] stats = stats_table.find('td div.statnum') - if not stats: - self.tweets_count = None - self.following_count = None - self.followers_count = None except: - self.tweets_count = None - self.following_count = None - self.followers_count = None + self.stats = None # get total tweets count if available try: