#!/usr/bin/env python
"""Parse LiveJournal's Atom feed of public posts for 'interesting' articles.

See http://brad.livejournal.com/2143713.html for details.

You can write your own callback handler; the RegexMatcher is
just an example.  All your class needs is a scan() method that
takes a dict as an argument.  The dict keys are

    title, content, link

The title and content can be in UTF-8.  (No 'time' key is stored;
the timestamp printed on a match comes from time.ctime().)

- we ignore stuff until we come across an <entry> tag
- store the entry's link href, title, and content (body)
- when we see an </entry>, we call the callback object's scan()
  method
"""

import re
import string
import sys
import time
import urllib
import xml.sax.handler
import xml.sax.xmlreader

__author__ = 'Faried Nawaz -- faried@gmail.com -- http://www.hungry.com/~fn/'
__version__ = '20060302'
__license__ = 'This code is in the public domain.'



class LJDocumentHandler(xml.sax.handler.ContentHandler):
    """SAX handler for LJ's Atom stream.

    Collects each <entry>'s link href, title, and content into a dict
    and hands it to callbackObject.scan() when the entry closes.
    """

    # element names whose character data we accumulate
    _TEXT_ELEMENTS = ('title', 'content')

    def __init__(self, outfile, callbackObject):
        # outfile: file-like object used for status messages
        # callbackObject: any object with a scan(dict) method
        self.outfile = outfile
        self.callbackObject = callbackObject
        self.inEntry = False        # True between <entry> and </entry>
        self.currElement = None     # key in self.contents being filled
        self.contents = {}          # per-entry data: link, title, content
        # plain int: auto-promotes to long in py2, so 0L is unnecessary
        self.bytesRead = 0

    def say(self, something):
        """Write a message to outfile and flush immediately."""
        self.outfile.write(something)
        self.outfile.flush()

    def startDocument(self):
        # self.say('document start\n')
        pass

    def endDocument(self):
        self.say('document end\n')

    def startElement(self, name, attrs):
        if name == 'entry':
            self.inEntry = True
            # fresh dict per entry: previously the same dict was reused
            # forever, so an entry missing e.g. <content> silently
            # inherited the previous entry's content, and callbacks that
            # kept a reference saw their data mutated later
            self.contents = {}
        elif self.inEntry:
            if name == 'link':
                self.currElement = 'link'
                self.contents[self.currElement] = attrs.getValue('href')
            elif name in self._TEXT_ELEMENTS:
                self.currElement = name
                self.contents[self.currElement] = ''

    def endElement(self, name):
        if name == 'entry':
            self.inEntry = False
            self.currElement = None
            self.callbackObject.scan(self.contents)
        elif self.inEntry and name in ('link',) + self._TEXT_ELEMENTS:
            self.currElement = None

    def characters(self, chars):
        """Accumulate character data into the current element's slot."""
        self.bytesRead = self.bytesRead + len(chars)
        if self.currElement:
            self.contents[self.currElement] = self.contents[self.currElement] + chars

    def bytes(self):
        """Report the number of characters seen this session."""
        self.say('total bytes in this session: %ld\n' % (self.bytesRead))


class RegexMatcher(object):
    """Callback that searches entry title/content with compiled regexes.

    regexdict maps a short label to a compiled regular expression; when
    an entry's title or body matches, a line with the label, where it
    matched, the current time, and the entry's link is written.
    """

    def __init__(self, outfile, regexdict):
        # outfile: file-like object match reports are written to
        # regexdict: {label: compiled regex}
        self.outfile = outfile
        self.regexs = regexdict

    def say(self, something):
        """Write a message to outfile and flush immediately."""
        self.outfile.write(something)
        self.outfile.flush()

    def scan(self, data):
        """Scan one entry dict (keys: title, content, link) for matches.

        Missing keys default to '' so a malformed entry cannot raise
        KeyError and kill the whole feed loop.
        """
        title = data.get('title', '')
        content = data.get('content', '')
        link = data.get('link', '')
        for regname, regex in self.regexs.items():
            if regex.search(title):
                self.say('%s title match @ %s: %s\n' %
                         (regname, time.ctime(), link))
            elif regex.search(content):
                self.say('%s body match @ %s: %s\n' %
                         (regname, time.ctime(), link))



def r(reg):
    """Shorthand for re.compile: compile *reg* case-insensitively."""
    pattern = re.compile(reg, re.IGNORECASE)
    return pattern



def main():
    """Fire it up."""

    # key is a label, value is a compiled regex
    regexs = {
        'aq' : r(r"([uo]sama )?b[ie]n lad[ie]n|q(u)?a[ie]da|zawahri|taliban|m[ou]lla(h)? [ou]m[ae]r"),
        'wat' : r(r"war (against|on) terror|terrorism|terrorist|jihadi"),
        'ir' : r(r"\biran"),
        'fn' : r(r"faried|fn@hungry")    # ah...vanity
        }

    regexmatcher = RegexMatcher(sys.stdout, regexs)
    handler = LJDocumentHandler(sys.stderr, regexmatcher)
    while 1:
        parser = None
        u = urllib.urlopen('http://danga.com:8081/atom-stream.xml')
        # u = urllib.urlopen('http://updates.sixapart.com/atom-stream.xml')
        while 1:
            line = u.readline()
            if line.find('<?xml version="1.0" encoding="utf-8"?>') == 0:
                # reset or create the parser
                parser = xml.sax.make_parser(['IncrementalParser'])
                parser.setContentHandler(handler)
            elif line.find('</feed>') == 0:
                # end of entry
                parser = None
            elif line.find('<time>') == 0:
                pass
            elif line.find('<sorryTooSlow') == 0:
                print line
            else:
                if not parser:
                    # create if it doesn't already exit
                    parser = xml.sax.make_parser(['IncrementalParser'])
                    parser.setContentHandler(handler)
                try:
                    parser.feed(line)
                except xml.sax._exceptions.SAXParseException, e:
                    print 'sax parse exception', e.getMessage()
                    parser = None
                except KeyboardInterrupt:
                    handler.bytes()
                    parser = None
                    u.close()
                    sys.exit(0)


if __name__ == '__main__': main()
