Blog, Open Source, SearchHub, Tutorials und Dokumentation

How to Get Started in Payloads

by Lucidworks
April 18, 2010

In "Getting Started with Payloads" (Lucid Imagination), I introduced the basics of payloads, but that article is now slightly out of date if you are using Lucene 3.x, so I thought I would give an update. The biggest change is on the query side: the BoostingTermQuery (BTQ) has been deprecated and removed in favor of the PayloadTermQuery (we felt the name better reflects what it does). We also slightly changed the Similarity.scorePayload signature to give more information.

Finally, the BoostingTermQuery used to take all of the payloads on a matching document, sum them up, and then average them. We have now made this pluggable by introducing the notion of a PayloadFunction and several implementations: AveragePayloadFunction (what the BTQ used to do), MinPayloadFunction, and MaxPayloadFunction. Of course, if you want your own, just extend the PayloadFunction and pass it in.

The new code should look like:

package com.lucidimagination.noodles;

import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
import org.apache.lucene.analysis.payloads.FloatEncoder;
import org.apache.lucene.analysis.payloads.PayloadEncoder;
import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.payloads.AveragePayloadFunction;
import org.apache.lucene.search.payloads.PayloadTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

import java.io.IOException;
import java.io.Reader;

/**
 * Demonstrates payload-aware scoring with {@link PayloadTermQuery}.
 *
 * <p>{@link #setUp()} builds an in-memory index from {@link #DOCS}, where a token may carry a
 * float payload written as {@code term|value}. {@link #testPayloads()} then runs the same term
 * through a {@link PayloadTermQuery} and a plain {@link TermQuery} and prints both result sets
 * with their score explanations so the effect of the payloads can be compared.
 **/
public class PayloadTest extends TestCase {
  /** Size in bytes of a float payload; used to validate a payload before decoding it. */
  private static final int FLOAT_PAYLOAD_BYTES = 4;

  /** In-memory index; created in {@link #setUp()} and released in {@link #tearDown()}. */
  Directory dir;

  /**
   * Sample documents. A token of the form {@code term|value} is split on {@code '|'} by the
   * analyzer; tokens without the delimiter are indexed without a payload.
   */
  public static String[] DOCS = {
          "The quick|2.0 red|2.0 fox|10.0 jumped|5.0 over the lazy|2.0 brown|2.0 dogs|10.0",
          "The quick red fox jumped over the lazy brown dogs",//no boosts
          "The quick|2.0 red|2.0 fox|10.0 jumped|5.0 over the old|2.0 brown|2.0 box|10.0",
          "Mary|10.0 had a little|2.0 lamb|10.0 whose fleece|10.0 was|5.0 white|2.0 as snow|10.0",
          "Mary had a little lamb whose fleece was white as snow",
          "Mary|10.0 takes on Wolf|10.0 Restoration|10.0 project|10.0 despite ties|10.0 to sheep|10.0 farming|10.0",
          "Mary|10.0 who lives|5.0 on a farm|10.0 is|5.0 happy|2.0 that she|10.0 takes|5.0 a walk|10.0 every day|10.0",
          "Moby|10.0 Dick|10.0 is|5.0 a story|10.0 of a whale|10.0 and a man|10.0 obsessed|10.0",
          "The robber|10.0 wore|5.0 a black|2.0 fleece|10.0 jacket|10.0 and a baseball|10.0 cap|10.0",
          "The English|10.0 Springer|10.0 Spaniel|10.0 is|5.0 the best|2.0 of all dogs|10.0"
  };

  /** Similarity shared by the index writer and the searcher. */
  protected PayloadSimilarity payloadSimilarity;

  /**
   * Creates a fresh in-memory index containing one document per entry of {@link #DOCS}.
   *
   * @throws Exception if the index cannot be written
   */
  @Override
  protected void setUp() throws Exception {
    super.setUp();
    dir = new RAMDirectory();

    PayloadEncoder encoder = new FloatEncoder();
    IndexWriter writer = new IndexWriter(dir, new PayloadAnalyzer(encoder), true, IndexWriter.MaxFieldLength.UNLIMITED);
    try {
      payloadSimilarity = new PayloadSimilarity();
      writer.setSimilarity(payloadSimilarity);
      for (int i = 0; i < DOCS.length; i++) {
        Document doc = new Document();
        Field id = new Field("id", "doc_" + i, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
        doc.add(id);
        //Analyzed but not stored; the payloads are attached to the tokens by PayloadAnalyzer
        Field text = new Field("body", DOCS[i], Field.Store.NO, Field.Index.ANALYZED);
        doc.add(text);
        writer.addDocument(doc);
      }
    } finally {
      //Close even when indexing fails so the writer is not left open on the directory
      writer.close();
    }
  }

  /**
   * Releases the in-memory index created by {@link #setUp()}.
   *
   * @throws Exception if closing the directory fails
   */
  @Override
  protected void tearDown() throws Exception {
    try {
      if (dir != null) {
        dir.close();
        dir = null;
      }
    } finally {
      super.tearDown();
    }
  }

  /**
   * Searches for "fox" with a payload-aware query and with a plain term query, printing the
   * hits and explanations of each.
   *
   * @throws Exception if searching or explaining fails
   */
  public void testPayloads() throws Exception {
    IndexSearcher searcher = new IndexSearcher(dir, true);
    try {
      searcher.setSimilarity(payloadSimilarity);//set the similarity.  Very important
      PayloadTermQuery btq = new PayloadTermQuery(new Term("body", "fox"), new AveragePayloadFunction());// was BoostingTermQuery(new Term("body", "fox"));
      TopDocs topDocs = searcher.search(btq, 10);
      printResults(searcher, btq, topDocs);

      TermQuery tq = new TermQuery(new Term("body", "fox"));
      topDocs = searcher.search(tq, 10);
      printResults(searcher, tq, topDocs);
    } finally {
      //The searcher opened its own reader on the directory; release it
      searcher.close();
    }
  }

  /**
   * Prints every hit in {@code topDocs} followed by the score explanation for that hit.
   *
   * @param searcher searcher used to produce the explanations
   * @param query    query the hits belong to
   * @param topDocs  hits to print
   * @throws IOException if an explanation cannot be computed
   */
  private void printResults(IndexSearcher searcher, Query query, TopDocs topDocs) throws IOException {
    System.out.println("-----------");
    System.out.println("Results for " + query + " of type: " + query.getClass().getName());
    for (ScoreDoc doc : topDocs.scoreDocs) {
      System.out.println("Doc: " + doc.toString());
      System.out.println("Explain: " + searcher.explain(query, doc.doc));
    }
  }

  /** Similarity that uses the float stored in a term's payload as the payload score. */
  class PayloadSimilarity extends DefaultSimilarity {
    /**
     * Decodes the payload as a float and returns it as the payload score.
     *
     * <p>Note the new similarity signature, giving much more information about the field name, etc.
     *
     * @param docId     document being scored
     * @param fieldName field the term occurs in
     * @param start     start position of the match
     * @param end       end position of the match
     * @param payload   payload bytes, read starting at {@code offset}
     * @param offset    index of the first payload byte
     * @param length    number of payload bytes
     * @return the decoded float, or the inherited default score when no complete float payload
     *         is present
     */
    @Override
    public float scorePayload(int docId, String fieldName, int start, int end, byte[] payload, int offset, int length) {
      //A missing or truncated payload cannot be decoded; defer to the default scoring instead of failing
      if (payload == null || length < FLOAT_PAYLOAD_BYTES) {
        return super.scorePayload(docId, fieldName, start, end, payload, offset, length);
      }
      return PayloadHelper.decodeFloat(payload, offset);
    }

  }

  /**
   * Analyzer that splits on whitespace, lower-cases, and then strips a {@code '|'}-delimited
   * suffix from each token, handing that suffix to the configured encoder.
   */
  class PayloadAnalyzer extends Analyzer {
    private final PayloadEncoder encoder;

    /**
     * @param encoder encoder passed to the {@link DelimitedPayloadTokenFilter}
     */
    PayloadAnalyzer(PayloadEncoder encoder) {
      this.encoder = encoder;
    }

    /**
     * Builds the whitespace / lower-case / delimited-payload filter chain.
     *
     * @param fieldName name of the field being analyzed (not used)
     * @param reader    source of the field text
     * @return the token stream producing the payload-carrying tokens
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
      TokenStream result = new WhitespaceTokenizer(reader);
      result = new LowerCaseFilter(result);
      result = new DelimitedPayloadTokenFilter(result, '|', encoder);
      return result;
    }
  }
}

About Lucidworks

LEARN MORE

Contact us today to learn how Lucidworks can help your team create powerful search and discovery applications for your customers and employees.

Lucidworks-Plattform – Übersicht

Lucidworks-Plattform – Preisgestaltung

KI-Zentrum

FUNKTIONEN VON LUCIDWORKS (ALLES INKLUSIVE)

Produktentdeckung

Searchandising

Websitesuche

Suche am Arbeitsplatz

Daten aufnehmen und Signale erfassen

Sucherlebnis der Mitarbeitenden

Kundenservice und Lösung von Fällen

KI und Large Language Models

LÖSUNGEN

Commerce

Kundenservice

Wissensmanagement

BRANCHEN

B2B-Commerce und -Vertrieb

B2B-Fertigung

Einzelhandel

Regierungsbehörden und öffentlicher Sektor

Gesundheitswesen

Finanzdienstleistungen

B2B Core Package

ENTDECKEN SIE UNSERE INHALTE

E-Books und Berichte

Blog

Videos

Presse

RESSOURCEN

Über Lucidworks

Dokumentation

Karriere

LucidAcademy

Kontakt

Technischer Support

How to Get Started in Payloads

About Lucidworks

LEARN MORE

Lucidworks-Plattform – Übersicht

Lucidworks-Plattform – Preisgestaltung

KI-Zentrum

FUNKTIONEN VON LUCIDWORKS (ALLES INKLUSIVE)

Produktentdeckung

Searchandising

Websitesuche

Suche am Arbeitsplatz

Daten aufnehmen und Signale erfassen

Sucherlebnis der Mitarbeitenden

Kundenservice und Lösung von Fällen

KI und Large Language Models

LÖSUNGEN

Commerce

Kundenservice

Wissensmanagement

BRANCHEN

B2B-Commerce und -Vertrieb

B2B-Fertigung

Einzelhandel

Regierungsbehörden und öffentlicher Sektor

Gesundheitswesen

Finanzdienstleistungen

B2B Core Package

ENTDECKEN SIE UNSERE INHALTE

E-Books und Berichte

Blog

Videos

Presse

RESSOURCEN

Über Lucidworks

Dokumentation

Karriere

LucidAcademy

Kontakt

Technischer Support

About Lucidworks

Related Articles

LEARN MORE