LibreOfficeの利用

OpenOffice/LibreOfficeの利用

標準の Fess 環境では、Apache POI を用いて MS Office 系ドキュメントをクロールできます。さらに、OpenOffice や LibreOffice を利用することで、オフィス系ドキュメントからより高精度にテキストを抽出することもできます。

設定方法

JodConverter を Fess サーバーにインストールします。http://jodconverter.googlecode.com/ から jodconverter-core-3.0-beta-4-dist.zip をダウンロードし、展開して以下の jar ファイルを Fess サーバーにコピーします。

# Unpack the JodConverter distribution archive.
unzip jodconverter-core-3.0-beta-4-dist.zip
# Copy the JodConverter jar and the jars bundled in its lib/ directory
# into the Fess WEB-INF/cmd/lib directory.
cp jodconverter-core-3.0-beta-4/lib/juh-3.2.1.jar \
   jodconverter-core-3.0-beta-4/lib/jurt-3.2.1.jar \
   jodconverter-core-3.0-beta-4/lib/ridl-3.2.1.jar \
   jodconverter-core-3.0-beta-4/lib/unoil-3.2.1.jar \
   jodconverter-core-3.0-beta-4/lib/jodconverter-core-3.0-beta-4.jar \
   fess-server-6.0.0/webapps/fess/WEB-INF/cmd/lib/
# Move into the Fess server directory; the following steps use paths relative to it.
cd fess-server-6.0.0/

次にs2robot_extractor.diconを作成します。

# Open the extractor configuration in an editor; the path is relative to
# fess-server-6.0.0/, which the preceding cd made the current directory.
vi webapps/fess/WEB-INF/classes/s2robot_extractor.dicon

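s2robot_extractor.dicon には、以下のような内容を記述して jodExtractor を有効にします。setOfficeHome に指定するパス (例では /usr/lib/libreoffice) は、利用する環境の OpenOffice/LibreOffice のインストール先に合わせて変更してください。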

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE components PUBLIC "-//SEASAR//DTD S2Container 2.4//EN"
    "http://www.seasar.org/dtd/components24.dtd">
<components>
    <!-- Extractor components, each registered under the given name.
         Of these, extractorFactory (defined later in this file) refers to
         tikaExtractor, pdfExtractor, htmlExtractor and xmlExtractor by name;
         the remaining names are declared here but not referenced by it. -->
    <component name="tikaExtractor" class="org.seasar.robot.extractor.impl.TikaExtractor"/>
    <component name="msWordExtractor" class="org.seasar.robot.extractor.impl.MsWordExtractor"/>
    <component name="msExcelExtractor" class="org.seasar.robot.extractor.impl.MsExcelExtractor"/>
    <component name="msPowerPointExtractor" class="org.seasar.robot.extractor.impl.MsPowerPointExtractor"/>
    <component name="msPublisherExtractor" class="org.seasar.robot.extractor.impl.MsPublisherExtractor"/>
    <component name="msVisioExtractor" class="org.seasar.robot.extractor.impl.MsVisioExtractor"/>
    <component name="pdfExtractor" class="org.seasar.robot.extractor.impl.PdfExtractor"/>
    <component name="textExtractor" class="org.seasar.robot.extractor.impl.TextExtractor"/>
    <component name="htmlExtractor" class="org.seasar.robot.extractor.impl.HtmlExtractor"/>
    <component name="xmlExtractor" class="org.seasar.robot.extractor.impl.XmlExtractor"/>
    <!-- XPath-based HTML extractor. On initialization, addFeature is called
         with the SAX "namespaces" feature URI and the value "false". -->
    <component name="htmlXpathExtractor" class="org.seasar.robot.extractor.impl.HtmlXpathExtractor">
        <initMethod name="addFeature">
            <arg>"http://xml.org/sax/features/namespaces"</arg>
            <arg>"false"</arg>
        </initMethod>
    </component>
    <!-- JodConverter configuration object; used by jodExtractor to build its office manager. -->
    <component name="officeManagerConfiguration"
        class="org.artofsolving.jodconverter.office.DefaultOfficeManagerConfiguration"/>
    <!-- Extractor backed by JodConverter. The officeManager property is the
         result of setting the office home directory on the configuration
         above and calling buildOfficeManager().
         NOTE(review): "/usr/lib/libreoffice" is the example install path;
         it presumably has to match the local OpenOffice/LibreOffice install. -->
    <component name="jodExtractor" class="org.seasar.robot.extractor.impl.JodExtractor">
        <property name="officeManager">officeManagerConfiguration.setOfficeHome("/usr/lib/libreoffice").buildOfficeManager()</property>
    </component>


    <!-- Extractor factory. Each addExtractor call below passes an array of
         MIME type strings together with the extractor component to register
         for those types. -->
    <component name="extractorFactory" class="org.seasar.robot.extractor.ExtractorFactory">
        <!-- MS Word / Excel / PowerPoint documents (binary and OOXML MIME types)
             are registered for jodExtractor. -->
        <initMethod name="addExtractor">
            <arg>{
"application/msword",
"application/vnd.ms-excel",
"application/vnd.ms-powerpoint",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
            }</arg>
            <arg>jodExtractor</arg>
        </initMethod>
        <!-- XML-family MIME types are registered for xmlExtractor. -->
        <initMethod name="addExtractor">
            <arg>{
"application/xml",
"application/xhtml+xml",
"application/rdf+xml",
"application/x-freemind",
"text/xml",
"text/xml-external-parsed-entity"
            }</arg>
            <arg>xmlExtractor</arg>
        </initMethod>
        <!-- text/html is registered for htmlExtractor. -->
        <initMethod name="addExtractor">
            <arg>{
"text/html"
            }</arg>
            <arg>htmlExtractor</arg>
        </initMethod>
        <!-- application/pdf is registered for pdfExtractor. -->
        <initMethod name="addExtractor">
            <arg>{
"application/pdf"
            }</arg>
            <arg>pdfExtractor</arg>
        </initMethod>
        <!-- All remaining listed types (other MS Office variants, OpenDocument,
             images, archives, audio, and the text/* family) are registered
             for tikaExtractor. -->
        <initMethod name="addExtractor">
            <arg>{
"image/svg+xml",
"application/x-tika-msoffice",
"application/vnd.visio",
"application/vnd.ms-excel.sheet.binary.macroenabled.12",
"application/vnd.ms-outlook",
"application/x-tika-ooxml",
"application/vnd.ms-powerpoint.presentation.macroenabled.12",
"application/vnd.openxmlformats-officedocument.presentationml.template",
"application/vnd.openxmlformats-officedocument.presentationml.slideshow",
"application/vnd.ms-powerpoint.slideshow.macroenabled.12",
"application/vnd.ms-powerpoint.addin.macroenabled.12",
"application/vnd.ms-excel.sheet.macroenabled.12",
"application/vnd.openxmlformats-officedocument.spreadsheetml.template",
"application/vnd.ms-excel.template.macroenabled.12",
"application/vnd.ms-excel.addin.macroenabled.12",
"application/vnd.ms-word.document.macroenabled.12",
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
"application/vnd.ms-word.template.macroenabled.12",
"application/x-asp",
"application/rtf",
"text/plain",
"application/vnd.sun.xml.writer",
"application/vnd.oasis.opendocument.text",
"application/vnd.oasis.opendocument.graphics",
"application/vnd.oasis.opendocument.presentation",
"application/vnd.oasis.opendocument.spreadsheet",
"application/vnd.oasis.opendocument.chart",
"application/vnd.oasis.opendocument.image",
"application/vnd.oasis.opendocument.formula",
"application/vnd.oasis.opendocument.text-master",
"application/vnd.oasis.opendocument.text-web",
"application/vnd.oasis.opendocument.text-template",
"application/vnd.oasis.opendocument.graphics-template",
"application/vnd.oasis.opendocument.presentation-template",
"application/vnd.oasis.opendocument.spreadsheet-template",
"application/vnd.oasis.opendocument.chart-template",
"application/vnd.oasis.opendocument.image-template",
"application/vnd.oasis.opendocument.formula-template",
"application/x-vnd.oasis.opendocument.text",
"application/x-vnd.oasis.opendocument.graphics",
"application/x-vnd.oasis.opendocument.presentation",
"application/x-vnd.oasis.opendocument.spreadsheet",
"application/x-vnd.oasis.opendocument.chart",
"application/x-vnd.oasis.opendocument.image",
"application/x-vnd.oasis.opendocument.formula",
"application/x-vnd.oasis.opendocument.text-master",
"application/x-vnd.oasis.opendocument.text-web",
"application/x-vnd.oasis.opendocument.text-template",
"application/x-vnd.oasis.opendocument.graphics-template",
"application/x-vnd.oasis.opendocument.presentation-template",
"application/x-vnd.oasis.opendocument.spreadsheet-template",
"application/x-vnd.oasis.opendocument.chart-template",
"application/x-vnd.oasis.opendocument.image-template",
"application/x-vnd.oasis.opendocument.formula-template",
"image/bmp",
"image/gif",
"image/jpeg",
"image/png",
"image/tiff",
"image/vnd.wap.wbmp",
"image/x-icon",
"image/x-psd",
"image/x-xcf",
"application/zip",
"application/x-tar",
"application/x-gtar",
"application/x-gzip",
"application/x-bzip",
"application/x-bzip2",
"application/java-vm",
"audio/mpeg",
"application/x-midi",
"audio/midi",
"audio/basic",
"audio/x-wav",
"audio/x-aiff",
"application/mbox",
"text/calendar",
"text/css",
"text/csv",
"text/directory",
"text/dns",
"text/ecmascript",
"text/enriched",
"text/example",
"text/javascript",
"text/parityfec",
"text/prs.fallenstein.rst",
"text/prs.lines.tag",
"text/red",
"text/rfc822-headers",
"text/richtext",
"text/rtf",
"text/rtp-enc-aescm128",
"text/rtx",
"text/sgml",
"text/t140",
"text/tab-separated-values",
"text/troff",
"text/ulpfec",
"text/uri-list",
"text/vnd.abc",
"text/vnd.curl",
"text/vnd.curl.dcurl",
"text/vnd.curl.mcurl",
"text/vnd.curl.scurl",
"text/vnd.dmclientscript",
"text/vnd.esmertec.theme-descriptor",
"text/vnd.fly",
"text/vnd.fmi.flexstor",
"text/vnd.graphviz",
"text/vnd.in3d.3dml",
"text/vnd.in3d.spot",
"text/vnd.iptc.newsml",
"text/vnd.iptc.nitf",
"text/vnd.latex-z",
"text/vnd.motorola.reflex",
"text/vnd.ms-mediapackage",
"text/vnd.net2phone.commcenter.command",
"text/vnd.si.uricatalogue",
"text/vnd.sun.j2me.app-descriptor",
"text/vnd.trolltech.linguist",
"text/vnd.wap.si",
"text/vnd.wap.sl",
"text/vnd.wap.wml",
"text/vnd.wap.wmlscript",
"text/x-asm",
"text/x-c",
"text/x-diff",
"text/x-fortran",
"text/x-java-source",
"text/x-pascal",
"text/x-setext",
"text/x-uuencode",
"text/x-vcalendar",
"text/x-vcard",
"application/x-sh"
            }</arg>
            <arg>tikaExtractor</arg>
        </initMethod>
    </component>
</components>

設定後、通常通りにクロールしてインデックスを生成します。