Class FessXpathTransformer

  • All Implemented Interfaces:
    FessTransformer, org.codelibs.fess.crawler.transformer.Transformer

    public class FessXpathTransformer
    extends org.codelibs.fess.crawler.transformer.impl.XpathTransformer
    implements FessTransformer
    • Field Summary

      Fields 
      Modifier and Type | Field | Description
      java.util.Map<java.lang.String,​java.lang.String> convertUrlMap  
      protected FessConfig fessConfig  
      protected java.util.Map<java.lang.String,​java.lang.Boolean> fieldPrunedRuleMap  
      boolean prunedContent  
      protected boolean useGoogleOffOn  
      • Fields inherited from class org.codelibs.fess.crawler.transformer.impl.XpathTransformer

        charsetName, dataClass, fieldRuleMap, trimSpace
      • Fields inherited from class org.codelibs.fess.crawler.transformer.impl.HtmlTransformer

        childUrlRuleMap, crawlerContainer, defaultEncoding, featureMap, invalidUrlPattern, LOCATION_HEADER, preloadSizeForCharset, propertyMap
      • Fields inherited from class org.codelibs.fess.crawler.transformer.impl.AbstractTransformer

        name
    • Method Summary

      All Methods | Instance Methods | Concrete Methods
      Modifier and Type | Method | Description
      protected void addChildUrlFromTagAttribute​(java.util.List<java.lang.String> urlList, java.net.URL url, java.lang.String attrValue, java.lang.String encoding)  
      void addFieldRule​(java.lang.String name, java.lang.String xpath, boolean isPruned)  
      protected java.util.List<org.codelibs.fess.crawler.entity.RequestData> convertChildUrlList​(java.util.List<org.codelibs.fess.crawler.entity.RequestData> urlList)  
      protected java.util.List<java.lang.String> getAnchorList​(org.w3c.dom.Document document, org.codelibs.fess.crawler.entity.ResponseData responseData)  
      protected java.lang.Integer getAttributeAsInteger​(org.w3c.dom.NamedNodeMap attributes, java.lang.String name)  
      protected java.net.URL getBaseUrl​(java.lang.String currentUrl, java.lang.String baseHref)  
      protected java.lang.String getCanonicalUrl​(org.codelibs.fess.crawler.entity.ResponseData responseData, org.w3c.dom.Document document)  
      protected java.util.Map<java.lang.String,​java.lang.String> getConfigPrameterMap​(org.codelibs.fess.crawler.entity.ResponseData responseData, CrawlingConfig.ConfigName config)  
      protected java.lang.String getContentXpath​(FessConfig fessConfig, java.util.Map<java.lang.String,​java.lang.String> xpathConfigMap)  
      java.lang.Object getData​(org.codelibs.fess.crawler.entity.AccessResultData<?> accessResultData)  
      protected java.lang.String getDigestXpath​(FessConfig fessConfig, java.util.Map<java.lang.String,​java.lang.String> xpathConfigMap)  
      FessConfig getFessConfig()  
      protected java.lang.String getLangXpath​(FessConfig fessConfig, java.util.Map<java.lang.String,​java.lang.String> xpathConfigMap)  
      org.slf4j.Logger getLogger()  
      protected java.lang.String getMultipleNodeValue​(org.w3c.dom.Document document, java.lang.String xpath)  
      protected java.lang.String getSingleNodeValue​(org.w3c.dom.Document document, java.lang.String xpath, boolean pruned)  
      protected java.lang.String getThumbnailSrc​(java.lang.String url, org.w3c.dom.NamedNodeMap attributes)  
      protected java.lang.String getThumbnailUrl​(org.codelibs.fess.crawler.entity.ResponseData responseData, org.w3c.dom.Document document)  
      protected java.net.URL getURL​(java.lang.String currentUrl, java.lang.String url)  
      void init()  
      protected boolean isPrunedTag​(org.w3c.dom.Node node)  
      protected boolean isValidCanonicalUrl​(java.lang.String url, java.lang.String canonicalUrl)  
      protected boolean isValidUrl​(java.lang.String urlStr)  
      protected java.lang.String normalizeCanonicalUrl​(java.lang.String baseUrl, java.lang.String canonicalUrl)  
      protected void normalizeData​(org.codelibs.fess.crawler.entity.ResponseData responseData, java.util.Map<java.lang.String,​java.lang.Object> dataMap)  
      protected void parseTextContent​(org.w3c.dom.Node node, java.lang.StringBuilder buf)  
      protected org.w3c.dom.Node processGoogleOffOn​(org.w3c.dom.Node node, org.codelibs.core.misc.ValueHolder<java.lang.Boolean> flag)  
      protected void processMetaRobots​(org.codelibs.fess.crawler.entity.ResponseData responseData, org.codelibs.fess.crawler.entity.ResultData resultData, org.w3c.dom.Document document)  
      protected void processXRobotsTag​(org.codelibs.fess.crawler.entity.ResponseData responseData, org.codelibs.fess.crawler.entity.ResultData resultData)  
      protected org.w3c.dom.Node pruneNode​(org.w3c.dom.Node node)  
      protected void putAdditionalData​(java.util.Map<java.lang.String,​java.lang.Object> dataMap, org.codelibs.fess.crawler.entity.ResponseData responseData, org.w3c.dom.Document document)  
      protected java.lang.String removeCommentTag​(java.lang.String content)  
      protected java.lang.String replaceDuplicateHost​(java.lang.String url)  
      void setUseGoogleOffOn​(boolean useGoogleOffOn)  
      protected void storeData​(org.codelibs.fess.crawler.entity.ResponseData responseData, org.codelibs.fess.crawler.entity.ResultData resultData)  
      • Methods inherited from class org.codelibs.fess.crawler.transformer.impl.XpathTransformer

        addFieldRule, getAdditionalData, getCharsetName, getDataClass, getFieldRuleMap, getResultDataBody, getResultDataBody, getResultDataFooter, getResultDataHeader, isTrimSpace, setCharsetName, setDataClass, setFieldRuleMap, setTrimSpace, trimSpace
      • Methods inherited from class org.codelibs.fess.crawler.transformer.impl.HtmlTransformer

        addChildUrlRule, addFeature, addProperty, encodeUrl, getBaseHref, getChildUrlRuleMap, getDefaultEncoding, getDomParser, getDuplicateUrl, getFeatureMap, getInvalidUrlPattern, getPreloadSizeForCharset, getPropertyMap, getUrlFromTagAttribute, getXPathAPI, isHtml, isSupportedCharset, isValidPath, loadCharset, normalizeEncoding, normalizeUrl, parseCharset, setChildUrlRuleMap, setDefaultEncoding, setFeatureMap, setInvalidUrlPattern, setPreloadSizeForCharset, setPropertyMap, storeChildUrls, transform, updateCharset
      • Methods inherited from class org.codelibs.fess.crawler.transformer.impl.AbstractTransformer

        getName, setName
      • Methods inherited from class java.lang.Object

        clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
    • Field Detail

      • prunedContent

        public boolean prunedContent
      • convertUrlMap

        public java.util.Map<java.lang.String,​java.lang.String> convertUrlMap
      • useGoogleOffOn

        protected boolean useGoogleOffOn
      • fieldPrunedRuleMap

        protected java.util.Map<java.lang.String,​java.lang.Boolean> fieldPrunedRuleMap
    • Constructor Detail

      • FessXpathTransformer

        public FessXpathTransformer()
    • Method Detail

      • init

        @PostConstruct
        public void init()
      • storeData

        protected void storeData​(org.codelibs.fess.crawler.entity.ResponseData responseData,
                                 org.codelibs.fess.crawler.entity.ResultData resultData)
        Overrides:
        storeData in class org.codelibs.fess.crawler.transformer.impl.XpathTransformer
      • normalizeData

        protected void normalizeData​(org.codelibs.fess.crawler.entity.ResponseData responseData,
                                     java.util.Map<java.lang.String,​java.lang.Object> dataMap)
      • processMetaRobots

        protected void processMetaRobots​(org.codelibs.fess.crawler.entity.ResponseData responseData,
                                         org.codelibs.fess.crawler.entity.ResultData resultData,
                                         org.w3c.dom.Document document)
      • processXRobotsTag

        protected void processXRobotsTag​(org.codelibs.fess.crawler.entity.ResponseData responseData,
                                         org.codelibs.fess.crawler.entity.ResultData resultData)
      • getConfigPrameterMap

        protected java.util.Map<java.lang.String,​java.lang.String> getConfigPrameterMap​(org.codelibs.fess.crawler.entity.ResponseData responseData,
                                                                                              CrawlingConfig.ConfigName config)
      • isValidUrl

        protected boolean isValidUrl​(java.lang.String urlStr)
      • isValidCanonicalUrl

        protected boolean isValidCanonicalUrl​(java.lang.String url,
                                              java.lang.String canonicalUrl)
      • putAdditionalData

        protected void putAdditionalData​(java.util.Map<java.lang.String,​java.lang.Object> dataMap,
                                         org.codelibs.fess.crawler.entity.ResponseData responseData,
                                         org.w3c.dom.Document document)
      • getLangXpath

        protected java.lang.String getLangXpath​(FessConfig fessConfig,
                                                java.util.Map<java.lang.String,​java.lang.String> xpathConfigMap)
      • getContentXpath

        protected java.lang.String getContentXpath​(FessConfig fessConfig,
                                                   java.util.Map<java.lang.String,​java.lang.String> xpathConfigMap)
      • getDigestXpath

        protected java.lang.String getDigestXpath​(FessConfig fessConfig,
                                                  java.util.Map<java.lang.String,​java.lang.String> xpathConfigMap)
      • getCanonicalUrl

        protected java.lang.String getCanonicalUrl​(org.codelibs.fess.crawler.entity.ResponseData responseData,
                                                   org.w3c.dom.Document document)
      • normalizeCanonicalUrl

        protected java.lang.String normalizeCanonicalUrl​(java.lang.String baseUrl,
                                                         java.lang.String canonicalUrl)
      • removeCommentTag

        protected java.lang.String removeCommentTag​(java.lang.String content)
      • getSingleNodeValue

        protected java.lang.String getSingleNodeValue​(org.w3c.dom.Document document,
                                                      java.lang.String xpath,
                                                      boolean pruned)
      • parseTextContent

        protected void parseTextContent​(org.w3c.dom.Node node,
                                        java.lang.StringBuilder buf)
      • processGoogleOffOn

        protected org.w3c.dom.Node processGoogleOffOn​(org.w3c.dom.Node node,
                                                      org.codelibs.core.misc.ValueHolder<java.lang.Boolean> flag)
      • pruneNode

        protected org.w3c.dom.Node pruneNode​(org.w3c.dom.Node node)
      • isPrunedTag

        protected boolean isPrunedTag​(org.w3c.dom.Node node)
      • getMultipleNodeValue

        protected java.lang.String getMultipleNodeValue​(org.w3c.dom.Document document,
                                                        java.lang.String xpath)
      • replaceDuplicateHost

        protected java.lang.String replaceDuplicateHost​(java.lang.String url)
      • getAnchorList

        protected java.util.List<java.lang.String> getAnchorList​(org.w3c.dom.Document document,
                                                                 org.codelibs.fess.crawler.entity.ResponseData responseData)
      • getBaseUrl

        protected java.net.URL getBaseUrl​(java.lang.String currentUrl,
                                          java.lang.String baseHref)
                                   throws java.net.MalformedURLException
        Throws:
        java.net.MalformedURLException
      • convertChildUrlList

        protected java.util.List<org.codelibs.fess.crawler.entity.RequestData> convertChildUrlList​(java.util.List<org.codelibs.fess.crawler.entity.RequestData> urlList)
        Overrides:
        convertChildUrlList in class org.codelibs.fess.crawler.transformer.impl.HtmlTransformer
      • getData

        public java.lang.Object getData​(org.codelibs.fess.crawler.entity.AccessResultData<?> accessResultData)
        Specified by:
        getData in interface org.codelibs.fess.crawler.transformer.Transformer
        Overrides:
        getData in class org.codelibs.fess.crawler.transformer.impl.XpathTransformer
      • addChildUrlFromTagAttribute

        protected void addChildUrlFromTagAttribute​(java.util.List<java.lang.String> urlList,
                                                   java.net.URL url,
                                                   java.lang.String attrValue,
                                                   java.lang.String encoding)
        Overrides:
        addChildUrlFromTagAttribute in class org.codelibs.fess.crawler.transformer.impl.HtmlTransformer
      • setUseGoogleOffOn

        public void setUseGoogleOffOn​(boolean useGoogleOffOn)
      • getThumbnailUrl

        protected java.lang.String getThumbnailUrl​(org.codelibs.fess.crawler.entity.ResponseData responseData,
                                                   org.w3c.dom.Document document)
      • getThumbnailSrc

        protected java.lang.String getThumbnailSrc​(java.lang.String url,
                                                   org.w3c.dom.NamedNodeMap attributes)
      • getAttributeAsInteger

        protected java.lang.Integer getAttributeAsInteger​(org.w3c.dom.NamedNodeMap attributes,
                                                          java.lang.String name)
      • getURL

        protected java.net.URL getURL​(java.lang.String currentUrl,
                                      java.lang.String url)
                               throws java.net.MalformedURLException
        Throws:
        java.net.MalformedURLException
      • addFieldRule

        public void addFieldRule​(java.lang.String name,
                                 java.lang.String xpath,
                                 boolean isPruned)