# Getting started with PyLucene
# Easy example, largely cribbed from http://www.inkdroid.org/talks/pylucene/

# Includes highlighting -- displaying keyword extracts from the document.

from PyLucene import Field, Document, StandardAnalyzer, FSDirectory, \
    IndexWriter, Field, IndexSearcher, QueryParser, Highlighter, QueryScorer, \
    StringReader

# --- Build the index ---------------------------------------------------
# Open the on-disk index directory "pylucene-index" and a writer on it.
# The trailing True on both calls is Lucene's "create" flag, so every run
# of this script starts from a fresh index.
store = FSDirectory.getDirectory( "pylucene-index", True )
writer = IndexWriter( store, StandardAnalyzer(), True )

# (body, url) pairs for the sample documents.  Only the body is searched
# below; the url is kept alongside it as an identifier for each document.
sample_docs = [
    ("""Some searchable text about me
Comments on Christopher Schmidt: Web Development Services
Copyright 2003-2005, Christopher Schmidt""", 'http://example.com/foo1'),
    ("""about me, about you""", 'http://example.com/foo'),
    (""" This is a lot of text that does nto contain the word that I am going to search for. This is because the word that I a mgoing to search for should not be in here so that I can test that queries are actually ranked. about
This release adds options for encoding (thanks to Nicko Cadell).
An "Encoder" implementation such as the new SimpleHTMLEncoder class can be passed to the highlighter to encode
all those non-xhtml standard characters such as &amp; into legal values. This simple class may not suffice for
about
some languages -  Commons Lang has an implementation that could be used: escapeHtml(String) in
http://svn.apache.org/viewcvs.cgi/jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringEscapeUtils.java?rev=137958&view=markup
""", 'http://example.com/foo2'),
]

try:
    for body, url in sample_docs:
        d = Document()
        # The body text is read back from search hits later for highlighting.
        d.add(Field.Text("body", body))
        d.add(Field.Keyword('url', url))
        writer.addDocument(d)
finally:
    # Always release the writer, even if adding a document fails, so the
    # index is not left open by a half-finished run.
    writer.close()

# --- Search the index and print highlighted extracts --------------------
# Re-open the existing index (False = do not re-create it), run a
# single-term query against the "body" field, and for each hit print the
# best-matching fragments of the body followed by the hit's score.
string = "about"
directory = FSDirectory.getDirectory( 'pylucene-index', False )
searcher = IndexSearcher( directory )
try:
    # One analyzer serves both query parsing and re-tokenizing the stored
    # text for the highlighter.
    analyzer = StandardAnalyzer()
    query = QueryParser.parse( string, 'body', analyzer )
    hits = searcher.search( query )
    highlighter = Highlighter(QueryScorer(query))

    for i in range(hits.length()):
        doc = hits.doc(i)
        text = doc.getField('body').stringValue()
        # The highlighter works on a token stream of the original text.
        ts = analyzer.tokenStream("body", StringReader(text))
        # Up to 3 fragments joined by "..."; newlines are flattened so each
        # extract prints on a single line.
        fragments = highlighter.getBestFragments(ts, text, 3, "...")
        print(fragments.replace("\n", " "))
        # To show the document's URL as well, print
        # doc.getField('url').stringValue() here.
        print("Score: %s" % hits.score(i))
        print("")
finally:
    # Release the searcher even if parsing, searching or highlighting fails.
    searcher.close()