Use of LibreOffice

This page is generated by Machine Translation from Japanese.

Use of OpenOffice/LibreOffice

In its standard configuration, Fess crawls MS Office documents using Apache POI. By using LibreOffice or OpenOffice to crawl Office documents instead, you can extract text from them more accurately.

How to set up

Install JodConverter on the Fess server. Download http://jodconverter.googlecode.com/jodconverter-core-3.0-beta-4-dist.zip, then expand the archive and copy the jar files to the Fess server.

# Unpack the JodConverter distribution archive into the current directory.
unzip jodconverter-core-3.0-beta-4-dist.zip

# Copy the JodConverter core jar and the four 3.2.1 jars from its lib directory
# into the Fess command library directory.
cp \
  jodconverter-core-3.0-beta-4/lib/juh-3.2.1.jar \
  jodconverter-core-3.0-beta-4/lib/jurt-3.2.1.jar \
  jodconverter-core-3.0-beta-4/lib/ridl-3.2.1.jar \
  jodconverter-core-3.0-beta-4/lib/unoil-3.2.1.jar \
  jodconverter-core-3.0-beta-4/lib/jodconverter-core-3.0-beta-4.jar \
  fess-server-6.0.0/webapps/fess/WEB-INF/cmd/lib/

# Move into the Fess server directory for the steps that follow.
cd fess-server-6.0.0/

Next, create s2robot_extractor.dicon.

# Open the extractor configuration file in vi; the path is relative to the current directory.
vi webapps/fess/WEB-INF/classes/s2robot_extractor.dicon

Give s2robot_extractor.dicon the following contents, which enable jodExtractor.

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE components PUBLIC "-//SEASAR//DTD S2Container 2.4//EN"
    "http://www.seasar.org/dtd/components24.dtd">
<components>
    <!-- Named extractor components, one per implementation class in
         org.seasar.robot.extractor.impl. Of these, only tikaExtractor, xmlExtractor,
         htmlExtractor and pdfExtractor are referenced by extractorFactory in this file;
         the ms*Extractor and textExtractor components are declared but not mapped there. -->
    <component name="tikaExtractor" class="org.seasar.robot.extractor.impl.TikaExtractor"/>
    <component name="msWordExtractor" class="org.seasar.robot.extractor.impl.MsWordExtractor"/>
    <component name="msExcelExtractor" class="org.seasar.robot.extractor.impl.MsExcelExtractor"/>
    <component name="msPowerPointExtractor" class="org.seasar.robot.extractor.impl.MsPowerPointExtractor"/>
    <component name="msPublisherExtractor" class="org.seasar.robot.extractor.impl.MsPublisherExtractor"/>
    <component name="msVisioExtractor" class="org.seasar.robot.extractor.impl.MsVisioExtractor"/>
    <component name="pdfExtractor" class="org.seasar.robot.extractor.impl.PdfExtractor"/>
    <component name="textExtractor" class="org.seasar.robot.extractor.impl.TextExtractor"/>
    <component name="htmlExtractor" class="org.seasar.robot.extractor.impl.HtmlExtractor"/>
    <component name="xmlExtractor" class="org.seasar.robot.extractor.impl.XmlExtractor"/>
    <!-- htmlXpathExtractor: its initMethod calls addFeature with the feature URI
         "http://xml.org/sax/features/namespaces" and the value "false".
         NOTE(review): presumably this turns off namespace processing in the underlying
         parser; confirm against HtmlXpathExtractor. -->
    <component name="htmlXpathExtractor" class="org.seasar.robot.extractor.impl.HtmlXpathExtractor">
        <initMethod name="addFeature">
            <arg>"http://xml.org/sax/features/namespaces"</arg>
            <arg>"false"</arg>
        </initMethod>
    </component>
    <!-- officeManagerConfiguration: a JodConverter DefaultOfficeManagerConfiguration declared
         with no properties set here. The jodExtractor component in this file calls
         setOfficeHome(...) and buildOfficeManager() on it. -->
    <component name="officeManagerConfiguration"
        class="org.artofsolving.jodconverter.office.DefaultOfficeManagerConfiguration">
    </component>
    <!-- jodExtractor: its officeManager property is set from the expression below, which
         calls setOfficeHome("/usr/lib/libreoffice") on officeManagerConfiguration and then
         buildOfficeManager() on the result.
         NOTE(review): /usr/lib/libreoffice is presumably the LibreOffice/OpenOffice install
         directory on the Fess server; adjust it to match the local installation. -->
    <component name="jodExtractor"
        class="org.seasar.robot.extractor.impl.JodExtractor">
        <property name="officeManager">
            officeManagerConfiguration.setOfficeHome("/usr/lib/libreoffice").buildOfficeManager()
        </property>
    </component>


    <!-- extractorFactory: maps content types to extractor components. Each initMethod below
         calls addExtractor with two arguments: a brace-delimited list of content-type strings
         and the name of the extractor component to use for them. -->
    <component name="extractorFactory" class="org.seasar.robot.extractor.ExtractorFactory">
        <!-- Word, Excel and PowerPoint content types (both the legacy ms-* types and the
             openxmlformats document/sheet/presentation types) are assigned to jodExtractor. -->
        <initMethod name="addExtractor">
            <arg>{
"application/msword",
"application/vnd.ms-excel",
"application/vnd.ms-powerpoint",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
            }</arg>
            <arg>jodExtractor</arg>
        </initMethod>
        <!-- XML-based content types are assigned to xmlExtractor. -->
        <initMethod name="addExtractor">
            <arg>{
"application/xml",
"application/xhtml+xml",
"application/rdf+xml",
"application/x-freemind",
"text/xml",
"text/xml-external-parsed-entity"
            }</arg>
            <arg>xmlExtractor</arg>
        </initMethod>
        <!-- text/html is assigned to htmlExtractor. -->
        <initMethod name="addExtractor">
            <arg>{
"text/html"
            }</arg>
            <arg>htmlExtractor</arg>
        </initMethod>
        <!-- application/pdf is assigned to pdfExtractor. -->
        <initMethod name="addExtractor">
            <arg>{
"application/pdf"
            }</arg>
            <arg>pdfExtractor</arg>
        </initMethod>
        <!-- All content types listed below are assigned to tikaExtractor. The list includes the
             remaining MS Office variants (templates, slideshows, add-ins, macro-enabled files),
             OpenDocument types, images, archives, audio and the text/* types. -->
        <initMethod name="addExtractor">
            <arg>{
"image/svg+xml",
"application/x-tika-msoffice",
"application/vnd.visio",
"application/vnd.ms-excel.sheet.binary.macroenabled.12",
"application/vnd.ms-outlook",
"application/x-tika-ooxml",
"application/vnd.ms-powerpoint.presentation.macroenabled.12",
"application/vnd.openxmlformats-officedocument.presentationml.template",
"application/vnd.openxmlformats-officedocument.presentationml.slideshow",
"application/vnd.ms-powerpoint.slideshow.macroenabled.12",
"application/vnd.ms-powerpoint.addin.macroenabled.12",
"application/vnd.ms-excel.sheet.macroenabled.12",
"application/vnd.openxmlformats-officedocument.spreadsheetml.template",
"application/vnd.ms-excel.template.macroenabled.12",
"application/vnd.ms-excel.addin.macroenabled.12",
"application/vnd.ms-word.document.macroenabled.12",
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
"application/vnd.ms-word.template.macroenabled.12",
"application/x-asp",
"application/rtf",
"text/plain",
"application/vnd.sun.xml.writer",
"application/vnd.oasis.opendocument.text",
"application/vnd.oasis.opendocument.graphics",
"application/vnd.oasis.opendocument.presentation",
"application/vnd.oasis.opendocument.spreadsheet",
"application/vnd.oasis.opendocument.chart",
"application/vnd.oasis.opendocument.image",
"application/vnd.oasis.opendocument.formula",
"application/vnd.oasis.opendocument.text-master",
"application/vnd.oasis.opendocument.text-web",
"application/vnd.oasis.opendocument.text-template",
"application/vnd.oasis.opendocument.graphics-template",
"application/vnd.oasis.opendocument.presentation-template",
"application/vnd.oasis.opendocument.spreadsheet-template",
"application/vnd.oasis.opendocument.chart-template",
"application/vnd.oasis.opendocument.image-template",
"application/vnd.oasis.opendocument.formula-template",
"application/x-vnd.oasis.opendocument.text",
"application/x-vnd.oasis.opendocument.graphics",
"application/x-vnd.oasis.opendocument.presentation",
"application/x-vnd.oasis.opendocument.spreadsheet",
"application/x-vnd.oasis.opendocument.chart",
"application/x-vnd.oasis.opendocument.image",
"application/x-vnd.oasis.opendocument.formula",
"application/x-vnd.oasis.opendocument.text-master",
"application/x-vnd.oasis.opendocument.text-web",
"application/x-vnd.oasis.opendocument.text-template",
"application/x-vnd.oasis.opendocument.graphics-template",
"application/x-vnd.oasis.opendocument.presentation-template",
"application/x-vnd.oasis.opendocument.spreadsheet-template",
"application/x-vnd.oasis.opendocument.chart-template",
"application/x-vnd.oasis.opendocument.image-template",
"application/x-vnd.oasis.opendocument.formula-template",
"image/bmp",
"image/gif",
"image/jpeg",
"image/png",
"image/tiff",
"image/vnd.wap.wbmp",
"image/x-icon",
"image/x-psd",
"image/x-xcf",
"application/zip",
"application/x-tar",
"application/x-gtar",
"application/x-gzip",
"application/x-bzip",
"application/x-bzip2",
"application/java-vm",
"audio/mpeg",
"application/x-midi",
"audio/midi",
"audio/basic",
"audio/x-wav",
"audio/x-aiff",
"application/mbox",
"text/calendar",
"text/css",
"text/csv",
"text/directory",
"text/dns",
"text/ecmascript",
"text/enriched",
"text/example",
"text/javascript",
"text/parityfec",
"text/prs.fallenstein.rst",
"text/prs.lines.tag",
"text/red",
"text/rfc822-headers",
"text/richtext",
"text/rtf",
"text/rtp-enc-aescm128",
"text/rtx",
"text/sgml",
"text/t140",
"text/tab-separated-values",
"text/troff",
"text/ulpfec",
"text/uri-list",
"text/vnd.abc",
"text/vnd.curl",
"text/vnd.curl.dcurl",
"text/vnd.curl.mcurl",
"text/vnd.curl.scurl",
"text/vnd.dmclientscript",
"text/vnd.esmertec.theme-descriptor",
"text/vnd.fly",
"text/vnd.fmi.flexstor",
"text/vnd.graphviz",
"text/vnd.in3d.3dml",
"text/vnd.in3d.spot",
"text/vnd.iptc.newsml",
"text/vnd.iptc.nitf",
"text/vnd.latex-z",
"text/vnd.motorola.reflex",
"text/vnd.ms-mediapackage",
"text/vnd.net2phone.commcenter.command",
"text/vnd.si.uricatalogue",
"text/vnd.sun.j2me.app-descriptor",
"text/vnd.trolltech.linguist",
"text/vnd.wap.si",
"text/vnd.wap.sl",
"text/vnd.wap.wml",
"text/vnd.wap.wmlscript",
"text/x-asm",
"text/x-c",
"text/x-diff",
"text/x-fortran",
"text/x-java-source",
"text/x-pascal",
"text/x-setext",
"text/x-uuencode",
"text/x-vcalendar",
"text/x-vcard",
"application/x-sh"
            }</arg>
            <arg>tikaExtractor</arg>
        </initMethod>
    </component>
</components>

After completing the configuration, run the crawl as usual to generate the index.