microformats2: attempt to refactor and improve extracting author from…

… feeds for #195. not fully functional yet. ugly.
snarfed · Apr 19, 2020 · 8e190da · 8e190da
1 parent 061ca7c
commit 8e190da
Show file tree

Hide file tree

Showing 6 changed files with 116 additions and 34 deletions.
diff --git a/api.py b/api.py
@@ -240,10 +240,6 @@ def write_response(self, response, actor=None, url=None, title=None,
         reader = self.request.get('reader', 'true').lower()
         if reader not in ('true', 'false'):
           self.abort(400, 'reader param must be either true or false')
-        if not actor and hfeed:
-          actor = microformats2.json_to_object({
-            'properties': hfeed.get('properties', {}),
-          })
         self.response.out.write(atom.activities_to_atom(
           activities, actor,
           host_url=url or self.request.host_url + '/',
@@ -260,7 +256,7 @@ def write_response(self, response, actor=None, url=None, title=None,
           title = 'Feed for %s' % url
         self.response.out.write(rss.from_activities(
           activities, actor, title=title,
-          feed_url=self.request.url, hfeed=hfeed,
+          feed_url=self.request.url, hfeed=hfeed, actor=actor,
           home_page_url=util.base_url(url)))
       elif format in ('as1-xml', 'xml'):
         self.response.out.write(XML_TEMPLATE % util.to_xml(response))

diff --git a/app.py b/app.py
@@ -193,9 +193,10 @@ def get(self):
       except (TypeError, ValueError):
         raise exc.HTTPBadRequest('Could not decode %s as JSON' % final_url)
 
-    mf2 = None
+    soup = mf2 = None
     if input == 'html':
-      mf2 = util.parse_mf2(resp, id=fragment)
+      soup = util.parse_html(resp)
+      mf2 = util.parse_mf2(soup, id=fragment)
       if id and not mf2:
         raise exc.HTTPBadRequest('Got fragment %s but no element found with that id.' % fragment)
     elif input in ('mf2-json', 'json-mf2'):
@@ -206,18 +207,11 @@ def get(self):
           mf2.__class__.__name__)
       mf2.setdefault('rels', {})  # mf2util expects rels
 
-    actor = None
-    title = None
-    hfeed = None
-    if mf2:
-      def fetch_mf2_func(url):
-        if util.domain_or_parent_in(urllib.parse.urlparse(url).netloc, SILO_DOMAINS):
-          return {'items': [{'type': ['h-card'], 'properties': {'url': [url]}}]}
-        return util.fetch_mf2(url, gateway=True)
-
+    actor = title = hfeed = None
+    if soup:
       try:
-        actor = microformats2.find_author(mf2, fetch_mf2_func=fetch_mf2_func)
-        title = microformats2.get_title(mf2)
+        actor = microformats2.find_feed_author(soup, url=final_url, mf2=mf2)
+        title = actor.get('displayName')
         hfeed = mf2util.find_first_entry(mf2, ['h-feed'])
       except (KeyError, ValueError) as e:
         raise exc.HTTPBadRequest('Could not parse %s as %s: %s' % (final_url, input, e))

diff --git a/granary/atom.py b/granary/atom.py
@@ -324,13 +324,14 @@ def html_to_atom(html, url=None, fetch_author=False, reader=True):
   if fetch_author:
     assert url, 'fetch_author=True requires url!'
 
-  parsed = util.parse_mf2(html, url=url)
-  actor = microformats2.find_author(parsed, fetch_mf2_func=util.fetch_mf2)
+  soup = util.parse_html(html)
+  actor = microformats2.find_feed_author(soup, url=url)
+  mf2 = util.parse_mf2(soup, url=url)
 
   return activities_to_atom(
     microformats2.html_to_activities(html, url, actor),
     actor,
-    title=microformats2.get_title(parsed),
+    title=microformats2.html_title(soup),
     xml_base=util.base_url(url),
     host_url=url,
     reader=reader)

diff --git a/granary/microformats2.py b/granary/microformats2.py
@@ -1055,16 +1055,59 @@ def find_author(parsed, **kwargs):
     parsed: dict, parsed mf2 object (ie return value from mf2py.parse())
     kwargs: passed through to mf2util.find_author()
   """
-  author = mf2util.find_author(parsed, 'http://123', **kwargs)
-  if author:
-    photo = author.get('photo')
-    if isinstance(photo, dict):
-      photo = photo.get('url') or photo.get('value')
-    return {
-      'displayName': author.get('name'),
-      'url': author.get('url'),
-      'image': {'url': photo},
-    }
+  return author_to_actor(mf2util.find_author(parsed, 'http://123', **kwargs))
+
+
+def find_feed_author(soup, url=None, mf2=None):
+  """Returns the author of a feed page as an ActivityStreams actor dict.
+
+  Looks for the first h-feed in the page's mf2. If there is one, its first
+  ``author`` property is converted to an actor, or the h-feed itself if it
+  has no ``author``. A missing ``displayName`` then falls back to the HTML
+  <title>, and a missing ``url`` falls back to the ``url`` argument.
+
+  Args:
+    soup: :class:`bs4.BeautifulSoup`, parsed HTML page
+    url: str, optional, URL of feed page
+    mf2: dict, optional, parsed mf2 object (ie return value from mf2py.parse())
+
+  Returns: dict, AS actor. Always has ``displayName`` and ``url`` keys, but
+    either value may be None.
+  """
+  if not mf2:
+    # Pass the page URL through so that the parse has a base URL to work with,
+    # same as the other parse_mf2() callers that know the page URL.
+    mf2 = util.parse_mf2(soup, url=url)
+
+  actor = {}
+  feed = mf2util.find_first_entry(mf2, ['h-feed'])
+  if feed:
+    # No explicit author? Treat the h-feed itself as the author object.
+    author = util.get_first(feed.get('properties', {}), 'author') or feed
+    actor = author_to_actor(mf2util.parse_author(author), ellipsize=True)
+
+  if not actor.get('displayName'):
+    actor['displayName'] = html_title(soup)
+
+  if not actor.get('url'):
+    actor['url'] = url
+
+  return actor
+
+
+def author_to_actor(author, ellipsize=False):
+  """Converts an mf2 author to an ActivityStreams actor.
+
+  Two author shapes are accepted:
+
+  * a full mf2 object, with multiply-valued fields nested under
+    ``properties``
+  * a flat dict with top-level ``name``, ``url``, and ``photo`` keys. This is
+    the shape that callers pass in from :func:`mf2util.find_author` and
+    :func:`mf2util.parse_author`; reading only ``properties`` from those
+    would silently produce an actor with every field set to None.
+
+  Args:
+    author: dict, parsed mf2 author object, or None
+    ellipsize: boolean, whether to ellipsize the name if it's too long.
+      Accepted for callers, but not applied yet.
+
+  Returns: dict, AS actor with ``displayName``, ``url``, and ``image`` keys,
+    or an empty dict if author is empty or None
+  """
+  if not author:
+    return {}
+
+  if 'properties' in author:
+    prop = first_props(author['properties'])
+  else:
+    # Already flat and singly valued.
+    prop = author
+
+  # photo may be a plain URL string or a dict that carries the URL in either
+  # its url or value field.
+  photo = prop.get('photo')
+  if isinstance(photo, dict):
+    photo = photo.get('url') or photo.get('value')
+
+  return {
+    'displayName': prop.get('name'),
+    'url': prop.get('url'),
+    'image': {'url': photo},
+  }
 
 
 def get_title(mf2):
@@ -1082,6 +1125,19 @@ def get_title(mf2):
   return ''
 
 
+def html_title(soup):
+  """Looks up the text of the page's <title> element inside its <head>.
+
+  Args:
+    soup: :class:`bs4.BeautifulSoup`, parsed HTML page
+
+  Returns: the <title> element's ``.string``, or None if the page has no
+    <head> or its <head> has no <title>
+  """
+  head = soup.head
+  title = head.title if head else None
+  return title.string if title else None
+
+
 def first_props(props):
   """Converts a multiply-valued dict to singly valued.
 

diff --git a/granary/rss.py b/granary/rss.py
@@ -60,9 +60,10 @@ def from_activities(activities, actor=None, title=None, feed_url=None,
   fg.generator('granary', uri='https://granary.io/')
 
   hfeed = hfeed or {}
-  actor = actor or {}
-  image = (util.get_url(hfeed.get('properties', {}), 'photo') or
-           util.get_url(actor, 'image'))
+  # XXX TODO
+  # actor = actor or microformats2.find_feed_author(hfeed) or {}
+  image = (util.get_url(actor, 'image') or
+           util.get_url(hfeed.get('properties', {}), 'photo'))
   if image:
     fg.image(image)
 

diff --git a/granary/tests/test_atom.py b/granary/tests/test_atom.py
@@ -655,6 +655,40 @@ def test_html_to_atom_fetch_author(self):
 </author>
 """, got, ignore_blanks=True)
 
+  def test_html_to_atom_hfeed_u_url(self):
+    """Expects the h-feed's u-url, not the page URL, as the Atom <id>.
+
+    The h-feed has no name, so the feed <title> is expected to be the HTML
+    page's <title>.
+    """
+    self.assert_multiline_in("""\
+<id>http://u/url</id>
+<title>Page title</title>
+""", atom.html_to_atom("""\
+<html>
+<head><title>Page title</title></head>
+<div class="h-feed">
+  <a class="u-url" href="http://u/url"></a>
+  <article class="h-entry">
+    <p class="e-content">entry content</p>
+  </article>
+</div>
+</html>
+""", 'https://my.site/feed'),
+    ignore_blanks=True)
+
+  def test_html_to_atom_title_page_url(self):
+    """Expects the HTML <title> as feed title when the h-feed has no mf2 name.
+
+    The h-feed here has no u-url either, only unmarked text, so the <id> is
+    expected to come from the page URL argument.
+    """
+    # NOTE(review): the expected <id> below is http://my.site/feed, but the
+    # URL passed to html_to_atom() is https://my.site/feed. Confirm which
+    # scheme is intended.
+    self.assert_multiline_in("""\
+<id>http://my.site/feed</id>
+<title>Page title</title>
+""", atom.html_to_atom("""\
+<html>
+<head><title>Page title</title></head>
+<div class="h-feed">
+  <span>unmarked feed title</span>
+  <article class="h-entry">
+    <p class="e-content">entry content</p>
+  </article>
+</div>
+</html>
+""", 'https://my.site/feed'),
+    ignore_blanks=True)
+
   def test_html_to_atom_title_without_hfeed_name(self):
     self.assert_multiline_in("""\
 <generator uri="https://granary.io/">granary</generator>