pylucene - search for python

import PyLucene

Author: Ed Summers <ehs@pobox.com>
Version: 1

Search Technologies

Apache Lucene

pylucene

Indexing

Searching

Indexing an Mbox

Indexing Code

from mailbox import UnixMailbox
from PyLucene import Field, Document, StandardAnalyzer, FSDirectory, \
    IndexWriter

# Open the on-disk index directory "chipy-index" and a writer on top of it.
# NOTE(review): the two True flags are presumably Lucene's "create" switches
# (start a fresh index rather than append) -- confirm against the Lucene docs.
store = FSDirectory.getDirectory( "chipy-index", True )
writer = IndexWriter( store, StandardAnalyzer(), True )

try:
    mbox_file = open('chipy.mbox')
    try:
        mbox = UnixMailbox( mbox_file )
        # mbox.next() hands back None once the mailbox is exhausted, so use
        # None as the sentinel that ends the iteration.
        for msg in iter( mbox.next, None ):
            writer.addDocument( EmailDoc(msg) )
    finally:
        # Release the mailbox file handle whether or not indexing succeeded.
        mbox_file.close()
finally:
    # Always close the writer, even when a message fails to index part-way.
    writer.close()

EmailDoc

from PyLucene import Document, Field

class EmailDoc( Document ):
    """A Lucene Document built from a single mailbox message.

    Fields added to the document:

      from, subject, body -- Field.Text
      id                  -- Field.Keyword holding the Message-ID header
      date                -- Field.Keyword holding the Date header
      all                 -- Field.Text combining sender, subject and body,
                             so one query can match any of the three
    """

    def __init__( self, msg ):
        """Populate the document from msg.

        msg must provide getheader(name) and an fp file object; whatever
        msg.fp.read() returns is indexed as the body.  A header for which
        getheader() returns None is indexed as the empty string so that a
        message with missing headers can still be added to the index.
        """
        Document.__init__( self )

        sender = msg.getheader( 'From' ) or ''
        self.add( Field.Text( 'from', sender ) )

        subject = msg.getheader( 'Subject' ) or ''
        self.add( Field.Text( 'subject', subject ) )

        body = msg.fp.read()
        self.add( Field.Text( 'body', body ) )

        # Named message_id rather than id to avoid shadowing the builtin.
        message_id = msg.getheader( 'Message-ID' ) or ''
        self.add( Field.Keyword( 'id', message_id ) )

        # The search and print scripts read a 'date' field, so index it too.
        date = msg.getheader( 'Date' ) or ''
        self.add( Field.Keyword( 'date', date ) )

        # Join with a separator so the last word of one part is not fused
        # with the first word of the next when the text is tokenized.
        combined = '\n'.join( [ sender, subject, body ] )
        self.add( Field.Text( 'all', combined ) )

Searching the Email

Searching Code

from sys import argv
from PyLucene import FSDirectory, IndexSearcher, QueryParser, \
    StandardAnalyzer

# Search the 'all' field of the chipy index for the query given on the
# command line and print a short summary of every hit.

if len( argv ) < 2:
    raise SystemExit( "usage: %s QUERY" % argv[0] )


def field_value( doc, name ):
    """Return the string value of field `name` in doc, or '' if doc has no such field."""
    field = doc.getField( name )
    if field is None:
        return ''
    return field.stringValue()


query_text = argv[1].strip()
directory = FSDirectory.getDirectory( 'chipy-index', False )
searcher = IndexSearcher( directory )

try:
    query = QueryParser.parse( query_text, 'all', StandardAnalyzer() )
    hits = searcher.search( query )

    for i in range( hits.length() ):
        doc = hits.doc(i)
        # Single-argument print( ... ) behaves the same as the print
        # statement on Python 2 and also parses on Python 3.
        print( "ID: %s" % field_value( doc, 'id' ) )
        print( "From: %s" % field_value( doc, 'from' ) )
        print( "Subject: %s" % field_value( doc, 'subject' ) )
        print( "Date: %s" % field_value( doc, 'date' ) )
        print( "" )
finally:
    # Close the searcher even if parsing the query or searching fails.
    searcher.close()

Printing An Email

from sys import argv
from PyLucene import FSDirectory, IndexSearcher, TermQuery, Term

# Look up one message by the exact value of its 'id' field (given on the
# command line) and print its headers followed by its body.

if len( argv ) < 2:
    raise SystemExit( "usage: %s MESSAGE-ID" % argv[0] )


def field_value( doc, name ):
    """Return the string value of field `name` in doc, or '' if doc has no such field."""
    field = doc.getField( name )
    if field is None:
        return ''
    return field.stringValue()


# Named message_id rather than id to avoid shadowing the builtin.
message_id = argv[1].strip()
directory = FSDirectory.getDirectory( 'chipy-index', False )
searcher = IndexSearcher( directory )

try:
    query = TermQuery( Term( 'id', message_id ) )
    hits = searcher.search( query )

    # Without this check hits.doc(0) would be asked for a hit that does
    # not exist whenever the id is unknown.
    if hits.length() == 0:
        raise SystemExit( "no message found with id %s" % message_id )

    doc = hits.doc(0)
    # Single-argument print( ... ) behaves the same as the print statement
    # on Python 2 and also parses on Python 3.
    print( "ID: %s" % field_value( doc, 'id' ) )
    print( "From: %s" % field_value( doc, 'from' ) )
    print( "Subject: %s" % field_value( doc, 'subject' ) )
    print( "Date: %s" % field_value( doc, 'date' ) )
    print( field_value( doc, 'body' ) )
    print( "" )
finally:
    # Close the searcher on every path, including the "not found" exit.
    searcher.close()

Adios Amigos

You can download the source code for these examples here. If you want an mbox to play with, you can grab one from the chipy list archives.

Thanks to rst2s5, these slides were written in reStructuredText.