# Getting started with PyLucene
# Easy example, largely cribbed from http://www.inkdroid.org/talks/pylucene/
# Includes highlighting -- displaying keyword extracts from the document.
#
# The script builds a small on-disk index of three documents, searches the
# "body" field for a single term and prints, for every hit, the best
# highlighted fragments followed by the hit's score.

from PyLucene import Field, Document, StandardAnalyzer, FSDirectory, \
    IndexWriter, IndexSearcher, QueryParser, Highlighter, QueryScorer, \
    StringReader

# Directory (relative to the current working directory) holding the index.
INDEX_PATH = "pylucene-index"

# (body, url) pairs to index.
# NOTE: the sample text -- typos included -- is indexed content and is kept
# verbatim; editing it would change what the search below prints.
SAMPLE_DOCUMENTS = [
    (
        "Some searchable text about me Comments on Christopher Schmidt: "
        "Web Development Services Copyright 2003-2005, Christopher Schmidt",
        'http://example.com/foo1',
    ),
    (
        "about me, about you",
        'http://example.com/foo',
    ),
    (
        # Long document in which the search term is rare, so that it can be
        # checked that results really are ranked.
        " This is a lot of text that does nto contain the word that I am going "
        "to search for. This is because the word that I a mgoing to search for "
        "should not be in here so that I can test that queries are actually "
        "ranked. about This release adds options for encoding (thanks to Nicko "
        'Cadell). An "Encoder" implementation such as the new SimpleHTMLEncoder '
        "class can be passed to the highlighter to encode all those non-xhtml "
        "standard characters such as & into legal values. \n"
        "This simple class may not suffice for about some languages - Commons "
        "Lang has an implementation that could be used: escapeHtml(String) in "
        "http://svn.apache.org/viewcvs.cgi/jakarta/commons/proper/lang/trunk/"
        "src/java/org/apache/commons/lang/StringEscapeUtils.java"
        "?rev=137958&view=markup ",
        'http://example.com/foo2',
    ),
]

# --- Indexing ---------------------------------------------------------------

# Both `True` flags are the "create" arguments of the directory and the
# writer, so every run starts from a fresh index.
store = FSDirectory.getDirectory(INDEX_PATH, True)
writer = IndexWriter(store, StandardAnalyzer(), True)
try:
    for body, url in SAMPLE_DOCUMENTS:
        d = Document()
        d.add(Field.Text("body", body))
        d.add(Field.Keyword('url', url))
        writer.addDocument(d)
finally:
    # Close the writer even if adding a document fails, so the index files
    # are not left open.
    writer.close()

# --- Searching --------------------------------------------------------------

query_text = "about"
directory = FSDirectory.getDirectory(INDEX_PATH, False)
searcher = IndexSearcher(directory)
try:
    analyzer = StandardAnalyzer()
    query = QueryParser.parse(query_text, 'body', analyzer)
    hits = searcher.search(query)
    highlighter = Highlighter(QueryScorer(query))

    for i in range(hits.length()):
        doc = hits.doc(i)
        text = doc.getField('body').stringValue()
        # The highlighter needs the stored text re-tokenized to locate the
        # query terms inside it.
        ts = analyzer.tokenStream("body", StringReader(text))
        # Up to 3 fragments joined by "..."; newlines are flattened so each
        # hit's extract prints on a single line.
        print(highlighter.getBestFragments(ts, text, 3, "...").replace("\n", " "))
        # Uncomment to also show the full body and the stored URL:
        # print("%s" % doc.getField('body').stringValue())
        # print("URL: %s" % doc.getField('url').stringValue())
        print("Score: %s" % hits.score(i))
        print("")
finally:
    # The searcher was never closed in the original; release it only after
    # the loop, because the hits are read through it.
    searcher.close()