| Author: | Ed Summers <ehs@pobox.com> |
|---|---|
| Version: | 1 |
from mailbox import UnixMailbox
from PyLucene import Field, Document, StandardAnalyzer, FSDirectory, \
IndexWriter
# Build the Lucene index in ./chipy-index from every message in chipy.mbox.
# NOTE(review): EmailDoc must already be defined or imported when this runs;
# no import for it is visible next to this script -- confirm.

# Open the input first so a missing mbox does not wipe an existing index.
mbox_file = open('chipy.mbox')
try:
    # The trailing True arguments ask Lucene to create the index from scratch.
    store = FSDirectory.getDirectory("chipy-index", True)
    writer = IndexWriter(store, StandardAnalyzer(), True)
    try:
        mbox = UnixMailbox(mbox_file)
        while True:
            msg = mbox.next()
            # next() signals the end of the mailbox with None.
            if msg is None:
                break
            writer.addDocument(EmailDoc(msg))
    finally:
        # Always close the writer, even if a message fails to index.
        writer.close()
finally:
    mbox_file.close()
from PyLucene import Document, Field
class EmailDoc(Document):
    """Lucene document built from a single mailbox message.

    Fields added to the document:

    * ``from``, ``subject``, ``body`` -- added with ``Field.Text``
    * ``id``   -- the Message-ID header, added with ``Field.Keyword``
    * ``date`` -- the Date header, added with ``Field.Keyword``; the search
      scripts in this file print a ``date`` field for every hit
    * ``all``  -- sender, subject and body joined together; this is the
      default field the full-text search script queries
    """

    def __init__(self, msg):
        """Populate the document from *msg*.

        *msg* must provide ``getheader(name)`` and a readable ``fp``
        positioned at the message body (as the objects returned by
        ``UnixMailbox.next()`` are used here).
        """
        Document.__init__(self)

        # A missing header comes back as None; fall back to '' so neither the
        # Field constructors nor the join below fail on a sparse message.
        sender = msg.getheader('From') or ''
        subject = msg.getheader('Subject') or ''
        date = msg.getheader('Date') or ''
        message_id = msg.getheader('Message-ID') or ''
        body = msg.fp.read()

        self.add(Field.Text('from', sender))
        self.add(Field.Text('subject', subject))
        self.add(Field.Text('body', body))
        self.add(Field.Keyword('id', message_id))
        self.add(Field.Keyword('date', date))
        # Join with whitespace so the last word of one part and the first
        # word of the next are not fused into a single token.
        self.add(Field.Text('all', ' '.join((sender, subject, body))))
from sys import argv
from PyLucene import FSDirectory, IndexSearcher, QueryParser, \
StandardAnalyzer
def _field_text(doc, name):
    """Return the stored text of field *name* in *doc*, or '' if absent."""
    field = doc.getField(name)
    if field is None:
        return ''
    return field.stringValue()


# Full-text search: parse the first command-line argument as a Lucene query
# against the 'all' field of ./chipy-index and print a summary of each hit.
if len(argv) < 2:
    raise SystemExit("usage: %s QUERY" % argv[0])
query_text = argv[1].strip()

# False: open the existing index instead of creating a new one.
directory = FSDirectory.getDirectory('chipy-index', False)
searcher = IndexSearcher(directory)
try:
    query = QueryParser.parse(query_text, 'all', StandardAnalyzer())
    hits = searcher.search(query)
    for i in range(hits.length()):
        doc = hits.doc(i)
        # _field_text() prints an empty value for a field the index does not
        # store, rather than crashing on a missing field.
        print("ID: %s" % _field_text(doc, 'id'))
        print("From: %s" % _field_text(doc, 'from'))
        print("Subject: %s" % _field_text(doc, 'subject'))
        print("Date: %s" % _field_text(doc, 'date'))
        print("")
finally:
    # Hits are read through the searcher, so close it only after the loop.
    searcher.close()
from sys import argv
from PyLucene import FSDirectory, IndexSearcher, TermQuery, Term
def _field_text(doc, name):
    """Return the stored text of field *name* in *doc*, or '' if absent."""
    field = doc.getField(name)
    if field is None:
        return ''
    return field.stringValue()


# Fetch one message: look up the Message-ID given as the first command-line
# argument with an exact term match on the 'id' field and print the message.
if len(argv) < 2:
    raise SystemExit("usage: %s MESSAGE-ID" % argv[0])
message_id = argv[1].strip()

# False: open the existing index instead of creating a new one.
directory = FSDirectory.getDirectory('chipy-index', False)
searcher = IndexSearcher(directory)
try:
    query = TermQuery(Term('id', message_id))
    hits = searcher.search(query)
    # An unknown id yields no hits; report it instead of failing on doc(0).
    if hits.length() == 0:
        raise SystemExit("no message found with id %s" % message_id)
    doc = hits.doc(0)
    # _field_text() prints an empty value for a field the index does not
    # store, rather than crashing on a missing field.
    print("ID: %s" % _field_text(doc, 'id'))
    print("From: %s" % _field_text(doc, 'from'))
    print("Subject: %s" % _field_text(doc, 'subject'))
    print("Date: %s" % _field_text(doc, 'date'))
    print(_field_text(doc, 'body'))
    print("")
finally:
    # The hit is read through the searcher, so close it only when done.
    searcher.close()
You can download the source code for these examples here. If you want an mbox to play with, you can grab one from the chipy list archives.
Thanks to rst2s5 these slides were written in reStructuredText.