warbo-utilities: 3074e9a9743bf355a99de83aaf9daf65ffea3b44

#!/usr/bin/env python3
try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    from bs4 import BeautifulSoup
import datetime
import feedparser
from functools import reduce
import hashlib
import os
import pickle
import PyRSS2Gen
import subprocess
import sys
import time
import urllib.request

msg = lambda x: sys.stderr.write(x + '\n')

###

# The CSS class of the <div> containing the article body on BBC News pages
identifier = 'story-body'
def stripCrap(html):
    """Removes navigation, sidebars, etc. from the given string of HTML."""
    parsed_html = BeautifulSoup(html)
    story       = parsed_html.body.find('div', attrs={'class':identifier})
    if story is None:
        return html

    for args in [
        {'name':'ul', 'attrs':{'class':'sharetools'}},
        {'name':'script'}
    ]:
        for crap in story.findAll(**args):
            crap.extract()

    return '<html><head/><body>{0}</body></html>'.format(str(story))

def testStripCrap():
    import gzip
    with gzip.open(os.getenv('HTML_EXAMPLE'), 'rb') as f:
        content = f.read()
    html = BeautifulSoup(stripCrap(content))
    got  = html.body.div['class']
    assert got == [identifier], repr({
        'error'      : 'Expected top-level div to have identifier class',
        'identifier' : identifier,
        'got'        : got,
        'html'       : html})

###

def htmlToText(html):
    """Render the given string of HTML to a plain text form."""
    proc = subprocess.Popen(['html2text', '-nobs', '-ascii'],
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE)
    out, _ = proc.communicate(stripCrap(html).encode())
    return out

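# Note: htmlToText above shells out to an external html2text program, which
# must be on PATH. The '-nobs' and '-ascii' flags match the classic C++
# html2text tool; which variant is installed is an assumption.
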
def testHtmlToText():
    html = '<html><head /><body><p>Hello &amp; goodbye!</p></body></html>'
    text = htmlToText(html).decode()
    want = 'Hello & goodbye!'
    assert text.strip() == want.strip(), repr({
        'error' : 'Did not render as expected',
        'html'  : html,
        'text'  : text.strip(),
        'want'  : want.strip()})
###

cache = '/tmp/bbcnews-cached'
def getEntry(entry):
    """Fetch the page linked to by the given entry."""
    url  = entry.id
    # md5 needs bytes in Python 3, so encode the URL first
    path = cache + '/00-' + hashlib.md5(url.encode()).hexdigest()
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)

    msg('Fetching ' + url)
    response = urllib.request.urlopen(url)
    data     = {'url'     : response.geturl(),
                'content' : response.read()}
    time.sleep(2)  # For courtesy

    if os.path.exists(cache):
        msg('Caching to ' + path)
        with open(path, 'wb') as f:
            pickle.dump(data, f)
    return data

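# Note: caching in getEntry is opt-in. Nothing is written unless the cache
# directory already exists, e.g. after a manual 'mkdir /tmp/bbcnews-cached'.
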
def processEntry(entry):
    """Replaces the content of entry with a rendered version of the page."""
    escape = lambda x: reduce(lambda x, pair: x.replace(*pair),
                              [('&', '&amp;'),
                               ('<', '&lt;' ),
                               ('>', '&gt;' )],
                              x)

    content       = getEntry(entry)
    # html2text output is bytes, so decode before escaping for XML
    entry.summary = escape(htmlToText(stripCrap(content['content'])).decode())
    return entry

def renderToRss(feed):
    """Render feedparser data to RSS, taken from
    https://stackoverflow.com/a/191899/884682"""
    def dateOf(x):
        data = x.modified_parsed if hasattr(x, 'modified_parsed') else \
               x.published_parsed
        return datetime.datetime(*(data[:6]))

    items = [
        PyRSS2Gen.RSSItem(
            title       = x.title,
            link        = x.link,
            description = x.summary,
            guid        = x.link,
            pubDate     = dateOf(x))
        for x in feed.entries]

    rss = PyRSS2Gen.RSS2(
        title          = feed['feed'].get("title"),
        link           = feed['feed'].get("link"),
        description    = feed['feed'].get("description"),

        language       = feed['feed'].get("language"),
        copyright      = feed['feed'].get("copyright"),
        managingEditor = feed['feed'].get("managingEditor"),
        webMaster      = feed['feed'].get("webMaster"),
        pubDate        = feed['feed'].get("pubDate"),
        lastBuildDate  = feed['feed'].get("lastBuildDate"),

        categories     = feed['feed'].get("categories"),
        generator      = feed['feed'].get("generator"),
        docs           = feed['feed'].get("docs"),

        items          = items)

    return rss.to_xml()

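# Sketch of the intended pipeline (the feed URL and script name are
# assumptions; the script reads an RSS/Atom feed on stdin and writes
# cleaned RSS to stdout):
#
#   curl -s 'http://feeds.bbci.co.uk/news/rss.xml' | ./bbcnews.py > clean.rss
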
if os.getenv('RUN_TESTS') is None:
    feed         = feedparser.parse(sys.stdin.read())
    feed.entries = filter(lambda e: '/av/' not in getEntry(e)['url'],
                          feed.entries)
    feed.entries = list(map(processEntry, feed.entries))
    print(renderToRss(feed))
else:
    testHtmlToText()
    testStripCrap()
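
# To run the self-tests instead, set RUN_TESTS and point HTML_EXAMPLE at a
# gzipped sample page containing a 'story-body' div (file name hypothetical):
#
#   RUN_TESTS=1 HTML_EXAMPLE=sample.html.gz ./bbcnews.py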

Generated by git2html.