Skip to content

Commit

Permalink
microformats2: attempt to refactor and improve extracting author from…
Browse files Browse the repository at this point in the history
… feeds

for #195. not fully functional yet. ugly.
  • Loading branch information
snarfed committed Apr 19, 2020
1 parent 061ca7c commit 8e190da
Show file tree
Hide file tree
Showing 6 changed files with 116 additions and 34 deletions.
6 changes: 1 addition & 5 deletions api.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,10 +240,6 @@ def write_response(self, response, actor=None, url=None, title=None,
reader = self.request.get('reader', 'true').lower()
if reader not in ('true', 'false'):
self.abort(400, 'reader param must be either true or false')
if not actor and hfeed:
actor = microformats2.json_to_object({
'properties': hfeed.get('properties', {}),
})
self.response.out.write(atom.activities_to_atom(
activities, actor,
host_url=url or self.request.host_url + '/',
Expand All @@ -260,7 +256,7 @@ def write_response(self, response, actor=None, url=None, title=None,
title = 'Feed for %s' % url
self.response.out.write(rss.from_activities(
activities, actor, title=title,
feed_url=self.request.url, hfeed=hfeed,
feed_url=self.request.url, hfeed=hfeed, actor=actor,
home_page_url=util.base_url(url)))
elif format in ('as1-xml', 'xml'):
self.response.out.write(XML_TEMPLATE % util.to_xml(response))
Expand Down
20 changes: 7 additions & 13 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,9 +193,10 @@ def get(self):
except (TypeError, ValueError):
raise exc.HTTPBadRequest('Could not decode %s as JSON' % final_url)

mf2 = None
soup = mf2 = None
if input == 'html':
mf2 = util.parse_mf2(resp, id=fragment)
soup = util.parse_html(resp)
mf2 = util.parse_mf2(soup, id=fragment)
if id and not mf2:
raise exc.HTTPBadRequest('Got fragment %s but no element found with that id.' % fragment)
elif input in ('mf2-json', 'json-mf2'):
Expand All @@ -206,18 +207,11 @@ def get(self):
mf2.__class__.__name__)
mf2.setdefault('rels', {}) # mf2util expects rels

actor = None
title = None
hfeed = None
if mf2:
def fetch_mf2_func(url):
if util.domain_or_parent_in(urllib.parse.urlparse(url).netloc, SILO_DOMAINS):
return {'items': [{'type': ['h-card'], 'properties': {'url': [url]}}]}
return util.fetch_mf2(url, gateway=True)

actor = title = hfeed = None
if soup:
try:
actor = microformats2.find_author(mf2, fetch_mf2_func=fetch_mf2_func)
title = microformats2.get_title(mf2)
actor = microformats2.find_feed_author(soup, url=final_url, mf2=mf2)
title = actor.get('displayName')
hfeed = mf2util.find_first_entry(mf2, ['h-feed'])
except (KeyError, ValueError) as e:
raise exc.HTTPBadRequest('Could not parse %s as %s: %s' % (final_url, input, e))
Expand Down
7 changes: 4 additions & 3 deletions granary/atom.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,13 +324,14 @@ def html_to_atom(html, url=None, fetch_author=False, reader=True):
if fetch_author:
assert url, 'fetch_author=True requires url!'

parsed = util.parse_mf2(html, url=url)
actor = microformats2.find_author(parsed, fetch_mf2_func=util.fetch_mf2)
soup = util.parse_html(html)
actor = microformats2.find_feed_author(soup, url=url)
mf2 = util.parse_mf2(soup, url=url)

return activities_to_atom(
microformats2.html_to_activities(html, url, actor),
actor,
title=microformats2.get_title(parsed),
title=microformats2.html_title(soup),
xml_base=util.base_url(url),
host_url=url,
reader=reader)
Expand Down
76 changes: 66 additions & 10 deletions granary/microformats2.py
Original file line number Diff line number Diff line change
Expand Up @@ -1055,16 +1055,59 @@ def find_author(parsed, **kwargs):
parsed: dict, parsed mf2 object (ie return value from mf2py.parse())
kwargs: passed through to mf2util.find_author()
"""
author = mf2util.find_author(parsed, 'http://123', **kwargs)
if author:
photo = author.get('photo')
if isinstance(photo, dict):
photo = photo.get('url') or photo.get('value')
return {
'displayName': author.get('name'),
'url': author.get('url'),
'image': {'url': photo},
}
return author_to_actor(mf2util.find_author(parsed, 'http://123', **kwargs))


def find_feed_author(soup, url=None, mf2=None):
    """Returns the author of a feed page as an ActivityStreams actor dict.

    Looks for the first h-feed in the page's mf2. If one is found, its author
    property is used, or the h-feed itself when it has no author property.
    A missing displayName falls back to the HTML <title> text, and a missing
    url falls back to the url argument.

    Args:
      soup: :class:`bs4.BeautifulSoup`, parsed HTML page
      url: str, optional, URL of feed page
      mf2: dict, optional, parsed mf2 object (ie return value from
        mf2py.parse()). If not provided, it is parsed from soup.

    Returns: dict, AS actor
    """
    parsed = mf2 if mf2 else util.parse_mf2(soup)

    result = {}
    hfeed = mf2util.find_first_entry(parsed, ['h-feed'])
    if hfeed:
        feed_props = hfeed.get('properties', {})
        # no explicit author => treat the h-feed itself as the author
        source = util.get_first(feed_props, 'author') or hfeed
        result = author_to_actor(mf2util.parse_author(source), ellipsize=True)

    if not result.get('displayName'):
        result['displayName'] = html_title(soup)
    if not result.get('url'):
        result['url'] = url
    return result


def author_to_actor(author, ellipsize=False):
    """Converts an mf2 author to an ActivityStreams actor.

    Accepts either shape of author:

    * a flat dict with singly valued 'name', 'url', and 'photo' keys, which
      is what the previous inline conversion in find_author() read from
      mf2util.find_author()'s return value
    * a full mf2 object with a multiply valued 'properties' dict

    Args:
      author: dict, flat author dict or parsed mf2 author object, or None
      ellipsize: boolean, whether to ellipsize the name if it's too long.
        Accepted for callers, but not currently applied by this function.

    Returns: dict, AS actor, or {} if author is empty or None
    """
    if not author:
        return {}

    properties = author.get('properties')
    if isinstance(properties, dict):
        # full mf2 object: collapse multiply valued properties to single values
        prop = first_props(properties)
    else:
        # flat author dict: name/url/photo live at the top level
        prop = author

    photo = prop.get('photo')
    if isinstance(photo, dict):
        # photo may be an object with a url or value rather than a bare URL
        photo = photo.get('url') or photo.get('value')

    return {
        'displayName': prop.get('name'),
        'url': prop.get('url'),
        'image': {'url': photo},
    }


def get_title(mf2):
Expand All @@ -1082,6 +1125,19 @@ def get_title(mf2):
return ''


def html_title(soup):
    """Returns the text of the HTML <title> element inside <head>, or None.

    None is returned when the page has no <head>, or the <head> has no
    <title>.

    Args:
      soup: :class:`bs4.BeautifulSoup`, parsed HTML page
    """
    head_elem = soup.head
    title_elem = head_elem.title if head_elem else None
    return title_elem.string if title_elem else None


def first_props(props):
"""Converts a multiply-valued dict to singly valued.
Expand Down
7 changes: 4 additions & 3 deletions granary/rss.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,10 @@ def from_activities(activities, actor=None, title=None, feed_url=None,
fg.generator('granary', uri='https://granary.io/')

hfeed = hfeed or {}
actor = actor or {}
image = (util.get_url(hfeed.get('properties', {}), 'photo') or
util.get_url(actor, 'image'))
# XXX TODO
# actor = actor or microformats2.find_feed_author(hfeed) or {}
image = (util.get_url(actor, 'image') or
util.get_url(hfeed.get('properties', {}), 'photo'))
if image:
fg.image(image)

Expand Down
34 changes: 34 additions & 0 deletions granary/tests/test_atom.py
Original file line number Diff line number Diff line change
Expand Up @@ -655,6 +655,40 @@ def test_html_to_atom_fetch_author(self):
</author>
""", got, ignore_blanks=True)

def test_html_to_atom_hfeed_u_url(self):
    # Converts a page whose h-feed carries a u-url (http://u/url) and whose
    # <head> has a <title>, passing https://my.site/feed as the page URL.
    # Asserts that the Atom output contains the two lines below: an <id> equal
    # to the h-feed's u-url, and a <title> equal to the HTML <title> text.
    self.assert_multiline_in("""\
<id>http://u/url</id>
<title>Page title</title>
""", atom.html_to_atom("""\
<html>
<head><title>Page title</title></head>
<div class="h-feed">
<a class="u-url" href="http://u/url"></a>
<article class="h-entry">
<p class="e-content">entry content</p>
</article>
</div>
</html>
""", 'https://my.site/feed'),
    ignore_blanks=True)

def test_html_to_atom_title_page_url(self):
    # Converts a page whose h-feed has no u-url and no marked-up name (the
    # <span> has no microformats class), passing https://my.site/feed as the
    # page URL. Asserts that the Atom output contains the <id> and <title>
    # lines below; the <title> is the HTML <title> text.
    # NOTE(review): the expected <id> uses http:// while the URL passed to
    # html_to_atom() is https:// -- confirm which scheme is intended.
    self.assert_multiline_in("""\
<id>http://my.site/feed</id>
<title>Page title</title>
""", atom.html_to_atom("""\
<html>
<head><title>Page title</title></head>
<div class="h-feed">
<span>unmarked feed title</span>
<article class="h-entry">
<p class="e-content">entry content</p>
</article>
</div>
</html>
""", 'https://my.site/feed'),
    ignore_blanks=True)

def test_html_to_atom_title_without_hfeed_name(self):
self.assert_multiline_in("""\
<generator uri="https://granary.io/">granary</generator>
Expand Down

0 comments on commit 8e190da

Please sign in to comment.