warbo-utilities: 3074e9a9743bf355a99de83aaf9daf65ffea3b44
1: #!/usr/bin/env python3
2: try:
3: from BeautifulSoup import BeautifulSoup
4: except ImportError:
5: from bs4 import BeautifulSoup
6: import datetime
7: import feedparser
8: import hashlib
9: import os
10: import pickle
11: import PyRSS2Gen
12: import subprocess
13: import sys
14: import time
15: import urllib.request
16:
def msg(x):
    """Write the string *x* to stderr followed by a newline (progress/log output)."""
    return sys.stderr.write(x + '\n')
18:
19: ###
20:
identifier = 'story-body'

def stripCrap(html):
    """Removes navigation, sidebars, etc. from the given string of HTML."""
    soup = BeautifulSoup(html)
    story = soup.body.find('div', attrs={'class': identifier})
    if story is None:
        # No recognisable story div: hand the input back untouched.
        return html

    # Selectors for the junk we want removed from inside the story.
    unwanted = [{'name': 'ul', 'attrs': {'class': 'sharetools'}},
                {'name': 'script'}]
    for selector in unwanted:
        for junk in story.findAll(**selector):
            junk.extract()

    return '<html><head/><body>{0}</body></html>'.format(repr(story))
37:
def testStripCrap():
    """Self-test: stripCrap on the HTML_EXAMPLE fixture must leave the story
    div as the top-level element of the body."""
    import gzip
    # The fixture is a gzipped HTML page, its path given by HTML_EXAMPLE.
    with gzip.open(os.getenv('HTML_EXAMPLE'), 'rb') as f:
        raw = f.read()
    stripped = BeautifulSoup(stripCrap(raw))
    got = stripped.body.div['class']
    failure = repr({
        'error'      : 'Expected top-level div to have identifier class',
        'identifier' : identifier,
        'got'        : got,
        'html'       : stripped})
    assert got == [identifier], failure
49:
50: ###
51:
def htmlToText(html):
    """Render the given HTML (str or bytes) to plain text.

    The HTML is first cleaned with stripCrap, then piped through the external
    `html2text` tool.  Returns html2text's raw stdout as bytes.
    """
    stripped = stripCrap(html)
    # stripCrap returns its input unchanged when no story div is found, so a
    # bytes input can come back as bytes; only encode genuine strings (the
    # original unconditional .encode() raised AttributeError on bytes).
    if isinstance(stripped, str):
        stripped = stripped.encode()
    proc = subprocess.Popen(['html2text', '-nobs', '-ascii'],
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE)
    out, _ = proc.communicate(stripped)
    return out
59:
def testHtmlToText():
    """Self-test: a minimal HTML page should render to its paragraph text."""
    html = '<html><head /><body><p>Hello & goodbye!</p></body></html>'
    want = 'Hello & goodbye!'
    text = htmlToText(html).decode()
    failure = repr({
        'error' : 'Did not render as expected',
        'html'  : html,
        'text'  : text.strip(),
        'want'  : want.strip()})
    assert text.strip() == want.strip(), failure
69:
70: ###
71:
cache = '/tmp/bbcnews-cached'
def getEntry(entry):
    """Fetch the page linked to by the given entry.

    Returns a dict {'url': final URL after any redirects, 'content': raw
    response bytes}.  When the `cache` directory exists, results are pickled
    there keyed on the MD5 of the entry id, so repeated runs avoid re-fetching.
    """
    url = entry.id
    # hashlib.md5 requires bytes in Python 3; the entry id is a str.
    path = cache + '/00-' + hashlib.md5(url.encode()).hexdigest()
    if os.path.exists(path):
        # Pickle data is binary, so the cache file must be opened in 'rb' mode.
        with open(path, 'rb') as f:
            return pickle.load(f)

    msg('Fetching ' + url)
    response = urllib.request.urlopen(url)
    data = {'url'     : response.geturl(),
            'content' : response.read()}
    time.sleep(2) # For courtesy

    if os.path.exists(cache):
        msg('Caching to ' + path)
        # Binary mode here too, matching the 'rb' read above.
        with open(path, 'wb') as f:
            pickle.dump(data, f)
    return data
92:
93: from functools import reduce
def processEntry(entry):
    """Replaces the content of entry with a rendered version of the page.

    Fetches the linked page (via the cache), renders it to plain text and
    HTML-escapes the result so it can sit inside an RSS description element.
    """
    def escape(text):
        # '&' must be replaced first, otherwise the '&' introduced by the
        # other substitutions would itself be re-escaped.
        for old, new in [('&', '&amp;'),
                         ('<', '&lt;'),
                         ('>', '&gt;')]:
            text = text.replace(old, new)
        return text

    content = getEntry(entry)
    # htmlToText already runs its input through stripCrap, so a single call
    # suffices; it returns raw bytes, which must be decoded before the
    # str-based escaping above can be applied.
    entry.summary = escape(htmlToText(content['content']).decode())
    return entry
105:
def renderToRss(feed):
    """Render feedparser data to RSS, taken from
    https://stackoverflow.com/a/191899/884682"""
    def entryDate(entry):
        # Prefer the modification time when the feed supplies one.
        if hasattr(entry, 'modified_parsed'):
            stamp = entry.modified_parsed
        else:
            stamp = entry.published_parsed
        return datetime.datetime(*stamp[:6])

    items = []
    for entry in feed.entries:
        items.append(PyRSS2Gen.RSSItem(
            title       = entry.title,
            link        = entry.link,
            description = entry.summary,
            guid        = entry.link,
            pubDate     = entryDate(entry)))

    channel = feed['feed']
    rss = PyRSS2Gen.RSS2(
        title          = channel.get("title"),
        link           = channel.get("link"),
        description    = channel.get("description"),

        language       = channel.get("language"),
        copyright      = channel.get("copyright"),
        managingEditor = channel.get("managingEditor"),
        webMaster      = channel.get("webMaster"),
        pubDate        = channel.get("pubDate"),
        lastBuildDate  = channel.get("lastBuildDate"),

        categories     = channel.get("categories"),
        generator      = channel.get("generator"),
        docs           = channel.get("docs"),

        items          = items)

    return rss.to_xml()
142:
if os.getenv('RUN_TESTS') is None:
    # Normal mode: read a feed from stdin, drop audio/video ('/av/') entries,
    # render each remaining article to text and print the result as RSS.
    feed = feedparser.parse(sys.stdin.read())
    feed.entries = [processEntry(e)
                    for e in feed.entries
                    if '/av/' not in getEntry(e)['url']]
    print(renderToRss(feed))
else:
    # Test mode: RUN_TESTS is set, so just run the self-tests and exit.
    testHtmlToText()
    testStripCrap()
Generated by git2html.