CODEFETCH™
            Examples
Cache of LuceneInAction/src/lia/handlingtypes/html/NekoHTMLHandler.java from
http://www.lucenebook.com/LuceneInAction.zip
Source code below from:
Lucene in Action (In Action series)
By Erik Hatcher and Otis Gospodnetic
Published 28 December, 2004
Average rating

      Powells     Alibris


package lia.handlingtypes.html;

import lia.handlingtypes.framework.DocumentHandler;
import lia.handlingtypes.framework.DocumentHandlerException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import org.cyberneko.html.parsers.DOMFragmentParser;

import org.xml.sax.SAXException;
import org.xml.sax.InputSource;

import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.DocumentFragment;

import org.apache.html.dom.HTMLDocumentImpl;

import java.io.InputStream;
import java.io.IOException;
import java.io.FileInputStream;
import java.io.File;

/**
 * {@link DocumentHandler} that parses HTML with NekoHTML's
 * {@link DOMFragmentParser} and converts the result into a Lucene
 * {@link Document}.
 *
 * <p>The produced document may contain two fields:
 * <ul>
 *   <li><code>title</code> - text found under the first element named
 *       "title" (case-insensitive), added only when non-empty;</li>
 *   <li><code>body</code> - the concatenation of every text node in the
 *       parsed fragment, added only when non-empty. Because the whole
 *       fragment is walked, this includes the title text as well.</li>
 * </ul>
 *
 * <p>NOTE(review): a single parser instance is reused for every call;
 * presumably it must not be shared between threads - confirm against the
 * NekoHTML documentation before using one handler concurrently.
 */
public class NekoHTMLHandler implements DocumentHandler {
  /** Message used for every parse failure reported to the caller. */
  private static final String PARSE_ERROR = "Cannot parse HTML document: ";

  /** Parser reused across calls to {@link #getDocument(InputStream)}. */
  private final DOMFragmentParser parser = new DOMFragmentParser();

  /**
   * Parses the HTML read from the given stream and builds a Lucene
   * document from its title and text content.
   *
   * <p>The stream is not closed by this method; that remains the
   * caller's responsibility.
   *
   * @param is stream supplying the HTML to parse
   * @return a document holding the non-empty <code>title</code> and
   *         <code>body</code> fields
   * @throws DocumentHandlerException if reading or parsing fails; the
   *         underlying exception is preserved as the cause
   */
  public Document getDocument(InputStream is)
    throws DocumentHandlerException {

    DocumentFragment fragment =
      new HTMLDocumentImpl().createDocumentFragment();
    try {
      parser.parse(new InputSource(is), fragment);
    }
    catch (IOException e) {
      throw new DocumentHandlerException(PARSE_ERROR, e);
    }
    catch (SAXException e) {
      throw new DocumentHandlerException(PARSE_ERROR, e);
    }

    Document doc = new Document();

    // One buffer is reused for both extractions.
    StringBuffer sb = new StringBuffer();
    getText(sb, fragment, "title");
    String title = sb.toString();

    sb.setLength(0);
    getText(sb, fragment);
    String text = sb.toString();

    // StringBuffer.toString() never returns null, so only emptiness
    // needs to be checked before adding a field.
    if (title.length() > 0) {
      doc.add(Field.Text("title", title));
    }
    if (text.length() > 0) {
      doc.add(Field.Text("body", text));
    }

    return doc;
  }

  /**
   * Appends the value of every text node at or below <code>node</code>
   * to <code>sb</code>, in document order (depth-first).
   *
   * @param sb   buffer receiving the text
   * @param node root of the subtree to walk
   */
  private void getText(StringBuffer sb, Node node) {
    if (node.getNodeType() == Node.TEXT_NODE) {
      sb.append(node.getNodeValue());
    }
    NodeList children = node.getChildNodes();
    if (children == null) {
      return;
    }
    int count = children.getLength();
    for (int i = 0; i < count; i++) {
      getText(sb, children.item(i));
    }
  }

  /**
   * Searches depth-first for the first element whose name matches
   * <code>element</code> (ignoring case) and appends all text beneath it
   * to <code>sb</code>. The search stops at the first match.
   *
   * @param sb      buffer receiving the text of the matched element
   * @param node    root of the subtree to search
   * @param element name of the element to look for
   * @return <code>true</code> if a matching element was found,
   *         <code>false</code> otherwise
   */
  private boolean getText(StringBuffer sb, Node node,
    String element) {
    if (node.getNodeType() == Node.ELEMENT_NODE
        && element.equalsIgnoreCase(node.getNodeName())) {
      getText(sb, node);
      return true;
    }
    NodeList children = node.getChildNodes();
    if (children != null) {
      int count = children.getLength();
      for (int i = 0; i < count; i++) {
        if (getText(sb, children.item(i), element)) {
          return true;
        }
      }
    }
    return false;
  }

  /**
   * Command-line entry point: parses the HTML file named by the first
   * argument and prints the resulting Lucene document to standard output.
   *
   * @param args command-line arguments; <code>args[0]</code> is the path
   *             of the HTML file to parse
   * @throws Exception if the file cannot be opened or parsed
   */
  public static void main(String args[]) throws Exception {
    if (args.length < 1) {
      System.err.println("Usage: java "
        + NekoHTMLHandler.class.getName() + " <html-file>");
      System.exit(1);
    }

    NekoHTMLHandler handler = new NekoHTMLHandler();
    InputStream in = new FileInputStream(new File(args[0]));
    try {
      Document doc = handler.getDocument(in);
      System.out.println(doc);
    }
    finally {
      // This method opened the stream, so it must also release it,
      // even when parsing fails.
      in.close();
    }
  }
}