feed2maildir: 562ee14128a0c9c318394af1c1e275eb8039b217

     1: import datetime
     2: import hashlib
     3: import json
     4: import os
     5: import random
     6: import sys
     7: import time
     8: 
     9: if sys.version[0] == '2':
    10:     from HTMLParser import HTMLParser
    11: else:
    12:     from html.parser import HTMLParser
    13: 
    14: import dateutil.parser
    15: 
    16: # Python 2.x compatibility
    17: if sys.version[0] == '2':
    18:     FileNotFoundError = IOError
    19: 
    20: class HTMLStripper(HTMLParser):
    21:     """Strips HTML off an string"""
    22:     def __init__(self):
    23:         self.reset()
    24:         self.strict = False
    25:         self.fed = []
    26:         self.convert_charrefs = True
    27:         self.numlinks = 0
    28:         self.links = {}
    29: 
    30:     def handle_data(self, d):
    31:         self.fed.append(d)
    32: 
    33:     def handle_starttag(self, tag, attrs):
    34:         if tag == 'img':
    35:             for attr in attrs:
    36:                 if attr[0] == 'src':
    37:                     link = attr[1]
    38:                     break;
    39:             self.fed.append('[Image]: {}\n'.format(link))
    40:         elif tag == 'a':
    41:             for attr in attrs:
    42:                 if attr[0] == 'href':
    43:                     self.links[self.numlinks] = attr[1]
    44:         elif tag == 'li':
    45:             self.fed.append('- ')
    46: 
    47:     def handle_endtag(self, tag):
    48:         if tag == 'a':
    49:             self.fed.append(' [{}]'.format(self.numlinks))
    50:             self.numlinks += 1
    51: 
    52:     def get_data(self):
    53:         out = ''.join(self.fed)
    54:         if self.numlinks:
    55:             out += '\n'
    56:             for l in sorted(self.links.keys()):
    57:                 out += '  [{}]: {}\n'.format(l, self.links[l])
    58:         return out
    59: 
    60: class Converter:
    61:     """Converts new entries to maildir"""
    62: 
    63:     TEMPLATE = u"""MIME-Version: 1.0
    64: Date: {}
    65: Subject: {}
    66: From: {}
    67: Content-Type: text/plain
    68: X-feed2maildirsimple-hash: {}
    69: 
    70: Link: {}
    71: 
    72: {}
    73: """
    74: 
    75:     def __init__(self, maildir, name, strip=False, silent=False):
    76:         self.name    = name
    77:         self.silent  = silent
    78:         self.maildir = os.path.expanduser(maildir)
    79:         self.strip   = strip
    80:         self.delivered = 0
    81: 
    82:     def run(self):
    83:         """Do a full run"""
    84:         if self.feed:
    85:             hashes = self.check_maildir(self.maildir)
    86:             self.news = self.find_new(self.feed, hashes)
    87:             for newpost in self.news:
    88:                 self.write(self.compose(newpost))
    89: 
    90:     def load(self, feed):
    91:         """Load a feed"""
    92:         self.feed = feed
    93: 
    94:     def find_new(self, feed, hashes):
    95:         """Find the new posts by comparing them to the found hashes"""
    96:         new = []
    97: 
    98:         for post in feed.entries:
    99:             # See if we've already got a message for this item
   100:             h       = self.make_hash(post)
   101:             matches = [x for x in hashes if self.hashes_match(h, x)]
   102:             if matches == []:
   103:                 new.append(post)
   104:         return new
   105: 
   106:     def hashes_match(self, x, y):
   107:         """Check if the data in two hashes match"""
   108:         x_bits = [bit.split("PES") for bit in x.split("SEP")]
   109:         y_bits = [bit.split("PES") for bit in y.split("SEP")]
   110: 
   111:         x_data = {}
   112:         for k, v in x_bits:
   113:             x_data[k] = v.strip()
   114:         y_data = {}
   115:         for k, v in y_bits:
   116:             y_data[k] = v.strip()
   117: 
   118:         mismatch = False
   119:         for k in x_data:
   120:             if k in y_data:
   121:                 if x_data[k] != y_data[k]:
   122:                     mismatch = True
   123:         for k in y_data:
   124:             if k in x_data:
   125:                 if x_data[k] != y_data[k]:
   126:                     mismatch = True
   127: 
   128:         return (not mismatch)
   129: 
   130:     def make_hash(self, post):
   131:         """Make an identifying hash for this post"""
   132:         data = {"feed": self.name}
   133:         for k in ["id", "title", "ppg_canonical", "link", "author"]:
   134:             if k in post:
   135:                 h = hashlib.sha256()
   136:                 h.update(post[k].encode('utf-8'))
   137:                 data[k] = h.hexdigest()
   138:         return "SEP".join([k + "PES" + data[k] for k in sorted(data.keys())])
   139: 
   140:     def check_maildir(self, maildir):
   141:         """Check access to the maildir and try to create it if not present"""
   142:         mdirs = ('', '/tmp', '/new', '/cur')
   143:         for mdir in mdirs:
   144:             fullname = maildir + mdir
   145:             if not os.access(fullname, os.W_OK):
   146:                 try: # to make the maildirs
   147:                     os.mkdir(fullname)
   148:                 except:
   149:                     sys.exit('ERROR: accessing "{}" failed'.format(fullname))
   150: 
   151:         hashes = []
   152:         # Run a few times, to reduce the chance of missing something
   153:         for iteration in [0, 1, 2]:
   154:             # Look up all message filenames. These won't change, but they
   155:             # may be moved from 'new' to 'cur' while we're running.
   156:             messages = []
   157:             for subdir in ['new', 'cur']:
   158:                 messages += os.listdir(os.path.join(maildir, subdir))
   159:             for messagefile in list(set(messages)):
   160:                 # Look up the location of each message on demand, to prevent
   161:                 # our listings going stale
   162:                 foundfile = False
   163:                 for subdir in ['new', 'cur']:
   164:                     try:
   165:                         with open(os.path.join(maildir,
   166:                                                subdir,
   167:                                                messagefile),
   168:                                   'r') as message:
   169:                             foundfile = True
   170:                             found = [l for l in message.readlines()
   171:                                      if l.startswith('X-feed2maildirsimple-hash')]
   172:                             if found != []:
   173:                                 hashes.append(found[0].split(' ')[1])
   174:                     except IOError:
   175:                         # We only expect one to be found
   176:                         pass
   177:                 if not foundfile:
   178:                     print("WARNING: couldn't find {} in {}".format(
   179:                         messagefile,
   180:                         self.name))
   181:             time.sleep(1)
   182:         return list(set(hashes))
   183: 
   184:     def compose(self, post):
   185:         """Compose the mail using the tempate"""
   186:         try: # to get the update/publish time from the post
   187:             updated = post.updated
   188:         except: # the property is not set, use now()
   189:             updated = time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime())
   190:         desc = ''
   191:         if self.strip:
   192:             stripper = HTMLStripper()
   193:             stripper.feed(post.description)
   194:             desc = stripper.get_data()
   195:         else:
   196:             desc = post.description
   197:         return self.TEMPLATE.format(updated, post.title, self.name,
   198:                                     self.make_hash(post), post.link, desc)
   199: 
   200:     def write(self, message):
   201:         """Take a message and write it to a mail"""
   202:         rand = random.randint(0,0xFFFFFFFF)
   203:         dt = time.time()
   204:         ticks = int((dt - int(dt)) * 1000000)
   205:         pid = str(os.getpid())
   206:         host = os.uname()[1]
   207:         self.delivered += 1
   208:         name = u'{}/new/{}.M{}R{:08x}Q{}P{}.{}'.format(self.maildir, int(dt), ticks, rand, self.delivered, pid, host)
   209:         try: # to write out the message
   210:             with open(name, 'w') as f:
   211:                 # We can thank the P2/P3 unicode madness for this...
   212:                 if sys.version[0] == '2':
   213:                     f.write(str(message.encode('utf8')))
   214:                 else:
   215:                     f.write(message)
   216:         except:
   217:             self.output('WARNING: failed to write message to file')
   218: 
   219:     def mktime(self, arg):
   220:         """Make a datetime object from a time string"""
   221:         return dateutil.parser.parse(arg)
   222: 
   223:     def output(self, arg):
   224:         if not self.silent:
   225:             print(arg)

Generated by git2html.