feed2maildir: 562ee14128a0c9c318394af1c1e275eb8039b217
1: import datetime
2: import hashlib
3: import json
4: import os
5: import random
6: import sys
7: import time
8:
9: if sys.version[0] == '2':
10: from HTMLParser import HTMLParser
11: else:
12: from html.parser import HTMLParser
13:
14: import dateutil.parser
15:
16: # Python 2.x compatibility
17: if sys.version[0] == '2':
18: FileNotFoundError = IOError
19:
class HTMLStripper(HTMLParser):
    """Reduce an HTML fragment to plain text.

    Text nodes are kept as-is.  Images that carry a ``src`` attribute are
    rendered as ``[Image]: <src>`` lines, list items are prefixed with
    ``- `` and every closed ``<a href=...>`` gets a numbered ``[n]`` marker
    whose target is listed in a footnote block appended by ``get_data``.

    Typical use: ``feed(html)``, ``close()``, then ``get_data()``.
    """

    def __init__(self):
        # Explicit base-class call (rather than reset() alone) so every
        # attribute the parser expects is initialised; this spelling works
        # with both the Python 2 and Python 3 HTMLParser imports.
        HTMLParser.__init__(self)
        self.strict = False
        self.convert_charrefs = True
        self.fed = []            # output fragments, joined by get_data()
        self.numlinks = 0        # number of link markers emitted so far
        self.links = {}          # marker number -> href
        self._link_open = False  # an <a href=...> is waiting for its </a>

    def handle_data(self, d):
        """Collect a run of text."""
        self.fed.append(d)

    def handle_starttag(self, tag, attrs):
        """Translate the opening tags that have a plain-text rendering."""
        if tag == 'img':
            # Use the first src attribute; an image without one is skipped
            # instead of failing on an unbound variable.
            for key, value in attrs:
                if key == 'src':
                    if value is not None:
                        self.fed.append(u'[Image]: {}\n'.format(value))
                    break
        elif tag == 'a':
            href = None
            for key, value in attrs:
                if key == 'href' and value is not None:
                    href = value
            if href is not None:
                self.links[self.numlinks] = href
                self._link_open = True
        elif tag == 'li':
            self.fed.append('- ')

    def handle_endtag(self, tag):
        """Emit the ``[n]`` marker once a link that had an href is closed."""
        # Anchors without an href (or stray </a> tags) get no marker, so
        # every marker in the text has a matching footnote.
        if tag == 'a' and self._link_open:
            self.fed.append(u' [{}]'.format(self.numlinks))
            self.numlinks += 1
            self._link_open = False

    def get_data(self):
        """Return the collected text, followed by the link footnotes."""
        parts = list(self.fed)
        if self.numlinks:
            parts.append('\n')
            for number in sorted(self.links):
                parts.append(u' [{}]: {}\n'.format(number,
                                                   self.links[number]))
        return ''.join(parts)
59:
class Converter(object):
    """Deliver the new entries of a loaded feed into a maildir.

    Each delivered message carries an ``X-feed2maildirsimple-hash`` header
    built by ``make_hash``; on later runs those headers are read back from
    the maildir and compared against the feed so that only unseen entries
    are written.

    Feed entries are expected to support ``key in post``, ``post[key]`` and
    attribute access (``post.title`` ...), and the feed object to expose
    them as ``feed.entries``.
    """

    HASH_HEADER = 'X-feed2maildirsimple-hash'

    # The body is written as UTF-8 (see write()), so the charset is declared.
    TEMPLATE = u"""MIME-Version: 1.0
Date: {}
Subject: {}
From: {}
Content-Type: text/plain; charset=utf-8
X-feed2maildirsimple-hash: {}

Link: {}

{}
"""

    def __init__(self, maildir, name, strip=False, silent=False):
        """Remember the target maildir and the delivery options.

        maildir -- path of the maildir; ``~`` is expanded
        name    -- feed name, used as the From header and in the hash
        strip   -- when true, descriptions are run through HTMLStripper
        silent  -- when true, ``output`` prints nothing
        """
        self.name = name
        self.silent = silent
        self.maildir = os.path.expanduser(maildir)
        self.strip = strip
        self.delivered = 0
        # Defined up front so run() before load() is a no-op rather than
        # an AttributeError.
        self.feed = None
        self.news = []

    def run(self):
        """Do a full run: scan the maildir, then write every new entry."""
        if not self.feed:
            return
        hashes = self.check_maildir(self.maildir)
        self.news = self.find_new(self.feed, hashes)
        for newpost in self.news:
            self.write(self.compose(newpost))

    def load(self, feed):
        """Load a feed"""
        self.feed = feed

    def find_new(self, feed, hashes):
        """Return the entries of *feed* that match none of *hashes*."""
        new = []
        for post in feed.entries:
            digest = self.make_hash(post)
            if not any(self.hashes_match(digest, known) for known in hashes):
                new.append(post)
        return new

    @staticmethod
    def _parse_hash(value):
        """Split a hash string into a field dict, or None if malformed."""
        fields = {}
        for bit in value.split("SEP"):
            key, sep, digest = bit.partition("PES")
            if not sep:
                return None
            fields[key] = digest.strip()
        return fields

    def hashes_match(self, x, y):
        """Check if the data in two hashes match.

        Two hashes match when every field present in both has the same
        value; fields found in only one of them are ignored.  A string
        that is not in the ``keyPESvalueSEP...`` format matches nothing.
        """
        x_data = self._parse_hash(x)
        y_data = self._parse_hash(y)
        if x_data is None or y_data is None:
            return False
        return all(x_data[k] == y_data[k] for k in x_data if k in y_data)

    def make_hash(self, post):
        """Make an identifying hash for this post.

        The result joins ``<key>PES<value>`` pairs with ``SEP``, sorted by
        key: the feed name verbatim plus the SHA-256 hex digest of each of
        id, title, ppg_canonical, link and author that the post contains.
        """
        data = {"feed": self.name}
        for k in ["id", "title", "ppg_canonical", "link", "author"]:
            if k in post:
                data[k] = hashlib.sha256(post[k].encode('utf-8')).hexdigest()
        return "SEP".join([k + "PES" + data[k] for k in sorted(data)])

    def check_maildir(self, maildir, passes=3, delay=1):
        """Ensure the maildir exists and collect the hashes stored in it.

        maildir -- path of the maildir to check and scan
        passes  -- how many times ``new`` and ``cur`` are scanned
        delay   -- seconds to sleep between two passes

        Returns the distinct hash values found.  Exits the program when a
        maildir directory is not writable and cannot be created.
        """
        for mdir in ('', '/tmp', '/new', '/cur'):
            fullname = maildir + mdir
            if not os.access(fullname, os.W_OK):
                try:  # to make the maildirs
                    os.mkdir(fullname)
                except OSError:
                    sys.exit('ERROR: accessing "{}" failed'.format(fullname))

        prefix = self.HASH_HEADER.encode('ascii')
        hashes = set()
        # Run a few times, to reduce the chance of missing something
        for iteration in range(passes):
            # Look up all message filenames. These won't change, but they
            # may be moved from 'new' to 'cur' while we're running.
            messages = set()
            for subdir in ('new', 'cur'):
                messages.update(os.listdir(os.path.join(maildir, subdir)))
            for messagefile in messages:
                # Look up the location of each message on demand, to prevent
                # our listings going stale
                foundfile = False
                for subdir in ('new', 'cur'):
                    path = os.path.join(maildir, subdir, messagefile)
                    try:
                        # Binary mode: other mail in the maildir may not be
                        # decodable text.
                        with open(path, 'rb') as message:
                            foundfile = True
                            for line in message:
                                if line.startswith(prefix):
                                    # Keep everything after the header
                                    # colon; the feed name inside the hash
                                    # may itself contain spaces.
                                    value = line.partition(b':')[2].strip()
                                    if value:
                                        hashes.add(
                                            value.decode('utf-8', 'replace'))
                                    break
                    except IOError:
                        # We only expect one to be found
                        pass
                if not foundfile:
                    self.output("WARNING: couldn't find {} in {}".format(
                        messagefile, self.name))
            # Pause between passes only; nothing follows the last one.
            if delay and iteration < passes - 1:
                time.sleep(delay)
        return list(hashes)

    def compose(self, post):
        """Compose the mail for *post* using the template.

        Missing ``updated`` falls back to the current UTC time; missing
        title, link or description are rendered as empty strings.
        """
        updated = getattr(post, 'updated', None)
        if not updated:  # the property is not set, use now()
            updated = time.strftime("%a, %d %b %Y %H:%M:%S +0000",
                                    time.gmtime())
        desc = getattr(post, 'description', '') or ''
        if self.strip:
            stripper = HTMLStripper()
            stripper.feed(desc)
            # Flush text the parser may still be holding back (for example
            # a trailing "&" that could be the start of a character ref).
            stripper.close()
            desc = stripper.get_data()
        # Collapse whitespace so a newline in the title cannot end the
        # header block early.
        title = ' '.join((getattr(post, 'title', '') or '').split())
        link = getattr(post, 'link', '') or ''
        return self.TEMPLATE.format(updated, title, self.name,
                                    self.make_hash(post), link, desc)

    def write(self, message):
        """Take a message and deliver it into the maildir.

        The message is written to ``tmp`` and then renamed into ``new`` so
        readers never see a partially written file.  Failures are reported
        through ``output`` and do not raise.
        """
        now = time.time()
        self.delivered += 1
        unique = '{}.M{}R{:08x}Q{}P{}.{}'.format(
            int(now), int((now - int(now)) * 1000000),
            random.randint(0, 0xFFFFFFFF), self.delivered,
            os.getpid(), os.uname()[1])
        tmp_dir = os.path.join(self.maildir, 'tmp')
        tmp_path = os.path.join(tmp_dir, unique)
        new_path = os.path.join(self.maildir, 'new', unique)
        # Always write UTF-8 bytes, independent of locale and of the
        # Python 2/3 str difference.
        if isinstance(message, bytes):
            payload = message
        else:
            payload = message.encode('utf-8', 'replace')
        try:  # to write out the message
            if not os.path.isdir(tmp_dir):
                os.mkdir(tmp_dir)
            with open(tmp_path, 'wb') as f:
                f.write(payload)
            os.rename(tmp_path, new_path)
        except (IOError, OSError):
            self.output('WARNING: failed to write message to file')
            try:  # not to leave a partial file behind
                os.remove(tmp_path)
            except OSError:
                pass

    def mktime(self, arg):
        """Make a datetime object from a time string"""
        return dateutil.parser.parse(arg)

    def output(self, arg):
        """Print *arg* unless the converter was created with silent=True."""
        if not self.silent:
            print(arg)
Generated by git2html.