Skip to content
This repository has been archived by the owner on Aug 12, 2024. It is now read-only.

[refactor] fix Profile object #167

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 68 additions & 27 deletions twitter_scraper/modules/profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ def __init__(self, username):
headers = {
"Accept": "application/json, text/javascript, */*; q=0.01",
"Referer": f"https://twitter.com/{username}",
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8",
# Getting mobile webpage by using Chrome < 38
"User-Agent": 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062 Safari/537.36',
"X-Twitter-Active-User": "yes",
"X-Requested-With": "XMLHttpRequest",
"Accept-Language": "en-US",
Expand All @@ -49,71 +50,111 @@ def __parse_profile(self, page):
)
except ParserError:
pass

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You shouldn't ignore a fatal error.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That was in the original package, so it's not related to this PR. Still, it should be ignored.



# TODO cannot find ProfileHeaderCard-badges
try:
self.is_private = html.find(".ProfileHeaderCard-badges .Icon--protected")[0]
self.is_private = True
except:
self.is_private = False

# blue badge
self.is_verified = True
try:
self.is_verified = html.find(".ProfileHeaderCard-badges .Icon--verified")[0]
self.is_verified = True
q = html.find("a.badge")[0]
if not q:
self.is_verified = False
except:
self.is_verified = False

self.location = html.find(".ProfileHeaderCard-locationText")[0].text
if not self.location:
try:
self.location = html.find('div.location')[0].text
if not self.location:
self.location = None
except:
self.location = None

self.birthday = html.find(".ProfileHeaderCard-birthdateText")[0].text
if self.birthday:
self.birthday = self.birthday.replace("Born ", "")
else:
# TODO cannot find ProfileHeaderCard-birthdateText
try:
self.birthday = html.find(".ProfileHeaderCard-birthdateText")[0].text
if self.birthday:
self.birthday = self.birthday.replace("Born ", "")
else:
self.birthday = None
except:
self.birthday = None

self.profile_photo = html.find(".ProfileAvatar-image")[0].attrs["src"]
try:
self.profile_photo = html.find("td.avatar img")[0].attrs["src"]
except:
self.profile_photo = None

# TODO cannot find ProfileCanopy-headerBg
try:
self.banner_photo = html.find(".ProfileCanopy-headerBg img")[0].attrs["src"]
except KeyError:
except:
self.banner_photo = None

page_title = html.find("title")[0].text
self.name = page_title[: page_title.find("(")].strip()

self.user_id = html.find(".ProfileNav")[0].attrs["data-user-id"]
try:
page_title = html.find("title")[0].text
self.name = page_title[: page_title.find("(")].strip()
except:
self.name = None

try:
self.user_id = html.find(".ProfileNav")[0].attrs["data-user-id"]
except:
self.user_id = None

self.biography = html.find(".ProfileHeaderCard-bio")[0].text
if not self.birthday:
self.birthday = None
try:
self.biography = html.find("div.bio div.dir-ltr")[0].text
if not self.biography:
self.biography = None
except:
self.biography = None

self.website = html.find(".ProfileHeaderCard-urlText")[0].text
if not self.website:
try:
self.website = html.find("div.url div.dir-ltr")[0].text
if not self.website:
self.website = None
except:
self.website = None

# get stats table if available
stats_table = None
stats = None
try:
stats_table = html.find('table.profile-stats')[0]
stats = stats_table.find('td div.statnum')
if not stats:
self.tweets_count = None
self.following_count = None
self.followers_count = None
except:
self.tweets_count = None
self.following_count = None
self.followers_count = None
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
stats_table = None
stats = None
try:
stats_table = html.find('table.profile-stats')[0]
stats = stats_table.find('td div.statnum')
if not stats:
self.tweets_count = None
self.following_count = None
self.followers_count = None
except:
self.tweets_count = None
self.following_count = None
self.followers_count = None
try:
stats_table = html.find('table.profile-stats')[0]
stats = stats_table.find('td div.statnum')
except:
stats_table = None
stats = None

How about this?

Copy link
Author

@yimingStar yimingStar Sep 29, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@aldnav Thank you for your suggestion. Your suggestion is definitely better!
I just made an adjustment in this refactor.
The fix is in commit f13ec7f.


# get total tweets count if available
try:
q = html.find('li[class*="--tweets"] span[data-count]')[0].attrs["data-count"]
self.tweets_count = int(q)
self.tweets_count = int(stats[0].text.replace(',',''))
except:
self.tweets_count = None

# get total following count if available
try:
q = html.find('li[class*="--following"] span[data-count]')[0].attrs["data-count"]
self.following_count = int(q)
self.following_count = int(stats[1].text.replace(',',''))
except:
self.following_count = None

# get total follower count if available
try:
q = html.find('li[class*="--followers"] span[data-count]')[0].attrs["data-count"]
self.followers_count = int(q)
self.followers_count = int(stats[2].text.replace(',',''))
except:
self.followers_count = None

# get total like count if available
# TODO unfixed
try:
q = html.find('li[class*="--favorites"] span[data-count]')[0].attrs["data-count"]
self.likes_count = int(q)
Expand Down