#!/usr/bin/env python
"""Parse LiveJournal's Atom feed of public posts for 'interesting' articles.

See http://brad.livejournal.com/2143713.html for details.

You can write your own callback handler; the RegexMatcher is just an
example.  All your class needs is a scan() method that takes a dict as
an argument.

The dict keys set by LJDocumentHandler are

    link, title, content

The title and content can be in UTF-8.  (Earlier documentation also listed
a 'time' key holding a unix timestamp; the handler in this file never sets
it.)

- we ignore stuff until we come across an <entry> tag
- store the entry's link href, title, and content (body)
- when we see an </entry>, we call the callback object's scan() method
"""

import re
import string
import sys
import time
import urllib
import xml.sax.handler
import xml.sax.xmlreader

__author__ = 'Faried Nawaz -- faried@gmail.com -- http://www.hungry.com/~fn/'
__version__ = '20060302'
__license__ = 'This code is in the public domain.'


class LJDocumentHandler(xml.sax.handler.ContentHandler):
    """Parse LJ's feed.

    SAX content handler that collects the link href, title and content of
    every <entry> element and hands them, as a dict, to
    callbackObject.scan() when the entry closes.

    outfile        -- file-like object (write() and flush()) for status text
    callbackObject -- any object with a scan(dict) method
    """

    # Children of <entry> whose character data is accumulated as text.
    _TEXT_ELEMENTS = ('title', 'content')

    def __init__(self, outfile, callbackObject):
        # Explicit base-class call: ContentHandler is an old-style class on
        # Python 2, so super() is not usable there.
        xml.sax.handler.ContentHandler.__init__(self)
        self.outfile = outfile
        self.callbackObject = callbackObject
        self.inEntry = False
        self.currElement = None
        self.contents = self._blankEntry()
        # A plain int: the '0L' literal is a syntax error on Python 3, and
        # Python 2 ints promote to long automatically.
        self.bytesRead = 0

    @staticmethod
    def _blankEntry():
        """Return a fresh per-entry dict with every key present and empty."""
        return {'link': '', 'title': '', 'content': ''}

    def say(self, something):
        """Write something to outfile and flush immediately."""
        self.outfile.write(something)
        self.outfile.flush()

    def startDocument(self):
        # self.say('document start\n')
        pass

    def endDocument(self):
        self.say('document end\n')

    def startElement(self, name, attrs):
        """Track entry boundaries and start collecting link/title/content."""
        if name == 'entry':
            self.inEntry = True
            # Start every entry from a clean dict so a missing <title> or
            # <content> cannot inherit the previous entry's value, and so a
            # callback that kept the previous dict does not see it mutated.
            self.contents = self._blankEntry()
        elif not self.inEntry:
            # Elements outside an <entry> are ignored.
            return
        elif name == 'link':
            self.currElement = 'link'
            # A <link> without href yields '' instead of raising KeyError.
            self.contents['link'] = attrs.get('href', '')
        elif name in self._TEXT_ELEMENTS:
            self.currElement = name
            self.contents[name] = ''

    def endElement(self, name):
        """Dispatch the finished entry, or stop collecting for a child."""
        if name == 'entry':
            self.inEntry = False
            self.callbackObject.scan(self.contents)
            self.currElement = None
        elif self.inEntry and name in ('link', 'title', 'content'):
            self.currElement = None

    def characters(self, chars):
        """Count incoming text and append it to the element being collected."""
        # len(chars) is the length of the string SAX delivers; it is counted
        # for all character data, inside or outside an entry.
        self.bytesRead += len(chars)
        if self.currElement:
            self.contents[self.currElement] += chars

    def bytes(self):
        """Report the running total kept by characters() on outfile."""
        self.say('total bytes in this session: %ld\n' % (self.bytesRead))


class RegexMatcher(object):
    """Search using regular expressions.

    outfile   -- file-like object (write() and flush()) for match reports
    regexdict -- dict mapping a label to a compiled regular expression
    """

    def __init__(self, outfile, regexdict):
        self.outfile = outfile
        self.regexs = regexdict

    def say(self, something):
        """Write something to outfile and flush immediately."""
        self.outfile.write(something)
        self.outfile.flush()

    def scan(self, data):
        """Report every regex that matches the entry's title or body.

        data is a dict with 'title', 'content' and 'link' keys; a missing
        key is treated as an empty string.  For each regex the title is
        checked first and the body only if the title did not match, so a
        regex produces at most one line per entry.
        """
        title = data.get('title', '')
        body = data.get('content', '')
        link = data.get('link', '')
        for regname, regex in self.regexs.items():
            if regex.search(title):
                self.say('%s title match @ %s: %s\n' %
                         (regname, time.ctime(), link))
            elif regex.search(body):
                self.say('%s body match @ %s: %s\n' %
                         (regname, time.ctime(), link))


def r(reg):
    """Wrapper around re.compile that always adds re.IGNORECASE."""
    return re.compile(reg, re.IGNORECASE)


def main():
    """Fire it up."""
    # key is a label, value is a compiled regex
    regexs = {
        'aq' : r(r"([uo]sama )?b[ie]n lad[ie]n|q(u)?a[ie]da|zawahri|taliban|m[ou]lla(h)? [ou]m[ae]r"),
        'wat' : r(r"war (against|on) terror|terrorism|terrorist|jihadi"),
        'ir' : r(r"\biran"),
        'fn' : r(r"faried|fn@hungry")  # ah...vanity
    }

    regexmatcher = RegexMatcher(sys.stdout, regexs)
    handler = LJDocumentHandler(sys.stderr, regexmatcher)

    # NOTE(review): urllib.urlopen exists on Python 2 only; Python 3 spells
    # it urllib.request.urlopen.
    while 1:
        parser = None
        u = urllib.urlopen('http://danga.com:8081/atom-stream.xml')
        # u = urllib.urlopen('http://updates.sixapart.com/atom-stream.xml')
        while 1:
            line = u.readline()
            # NOTE(review): the string literals in the two find() calls below
            # are empty in the text this was restored from -- presumably tag
            # markers lost along with the other markup.  As written,
            # ''.find-style matching makes the first test always true;
            # confirm the intended literals against the original script.
            if line.find('') == 0:
                # reset or create the parser
                parser = xml.sax.make_parser(['IncrementalParser'])
                parser.setContentHandler(handler)
            elif line.find('') == 0:
                # end of entry
                parser = None
            # NOTE(review): the available text of this function ends here,
            # in the middle of a further "elif line.find(" branch.  The
            # remaining branches (and anything that feeds the parser or
            # leaves this loop) are not reproduced and must be restored
            # from the original script.