View Javadoc
1   /*
2    * Copyright 2012-2020 CodeLibs Project and the Others.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
13   * either express or implied. See the License for the specific language
14   * governing permissions and limitations under the License.
15   */
16  package org.codelibs.fess.crawler.transformer;
17  
18  import static org.codelibs.core.stream.StreamUtil.stream;
19  
20  import java.io.BufferedInputStream;
21  import java.net.MalformedURLException;
22  import java.net.URL;
23  import java.util.ArrayList;
24  import java.util.Collections;
25  import java.util.Date;
26  import java.util.HashMap;
27  import java.util.HashSet;
28  import java.util.LinkedHashMap;
29  import java.util.List;
30  import java.util.Locale;
31  import java.util.Map;
32  import java.util.Set;
33  import java.util.stream.Collectors;
34  
35  import javax.annotation.PostConstruct;
36  import javax.xml.transform.TransformerException;
37  
38  import org.apache.logging.log4j.LogManager;
39  import org.apache.logging.log4j.Logger;
40  import org.apache.xpath.objects.XObject;
41  import org.codelibs.core.io.InputStreamUtil;
42  import org.codelibs.core.io.SerializeUtil;
43  import org.codelibs.core.lang.StringUtil;
44  import org.codelibs.core.misc.ValueHolder;
45  import org.codelibs.fess.Constants;
46  import org.codelibs.fess.crawler.builder.RequestDataBuilder;
47  import org.codelibs.fess.crawler.entity.AccessResultData;
48  import org.codelibs.fess.crawler.entity.RequestData;
49  import org.codelibs.fess.crawler.entity.ResponseData;
50  import org.codelibs.fess.crawler.entity.ResultData;
51  import org.codelibs.fess.crawler.entity.UrlQueue;
52  import org.codelibs.fess.crawler.exception.ChildUrlsException;
53  import org.codelibs.fess.crawler.exception.CrawlerSystemException;
54  import org.codelibs.fess.crawler.exception.CrawlingAccessException;
55  import org.codelibs.fess.crawler.transformer.impl.XpathTransformer;
56  import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
57  import org.codelibs.fess.es.config.exentity.CrawlingConfig;
58  import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
59  import org.codelibs.fess.es.config.exentity.CrawlingConfig.Param.Config;
60  import org.codelibs.fess.es.config.exentity.CrawlingConfig.Param.XPath;
61  import org.codelibs.fess.helper.CrawlingConfigHelper;
62  import org.codelibs.fess.helper.CrawlingInfoHelper;
63  import org.codelibs.fess.helper.DocumentHelper;
64  import org.codelibs.fess.helper.DuplicateHostHelper;
65  import org.codelibs.fess.helper.FileTypeHelper;
66  import org.codelibs.fess.helper.LabelTypeHelper;
67  import org.codelibs.fess.helper.PathMappingHelper;
68  import org.codelibs.fess.helper.SystemHelper;
69  import org.codelibs.fess.mylasta.direction.FessConfig;
70  import org.codelibs.fess.util.ComponentUtil;
71  import org.codelibs.fess.util.PrunedTag;
72  import org.codelibs.nekohtml.parsers.DOMParser;
73  import org.w3c.dom.Document;
74  import org.w3c.dom.NamedNodeMap;
75  import org.w3c.dom.Node;
76  import org.w3c.dom.NodeList;
77  import org.xml.sax.InputSource;
78  
79  public class FessXpathTransformer extends XpathTransformer implements FessTransformer {
80  
        /** Logger used by this transformer; also exposed through {@code getLogger()}. */
81      private static final Logger logger = LogManager.getLogger(FessXpathTransformer.class);
82  
        /** Name of the response metadata entry inspected (case-insensitively) for robots directives. */
83      private static final String X_ROBOTS_TAG = "X-Robots-Tag";
84  
        /** XPath selecting the {@code content} attribute of META elements whose name is "thumbnail" or "THUMBNAIL". */
85      private static final String META_NAME_THUMBNAIL_CONTENT = "//META[@name=\"thumbnail\" or @name=\"THUMBNAIL\"]/@content";
86  
        /** XPath selecting the {@code content} attribute of META elements whose property is "og:image". */
87      private static final String META_PROPERTY_OGIMAGE_CONTENT = "//META[@property=\"og:image\"]/@content";
88  
        /** XPath selecting the {@code content} attribute of META elements whose name is "robots" or "ROBOTS". */
89      private static final String META_NAME_ROBOTS_CONTENT = "//META[@name=\"robots\" or @name=\"ROBOTS\"]/@content";
90  
        /** Robots directive treated as both noindex and nofollow; matched as a substring of the lower-cased value. */
91      private static final String ROBOTS_TAG_NONE = "none";
92  
        /** Robots directive that prevents indexing of the page; matched as a substring of the lower-cased value. */
93      private static final String ROBOTS_TAG_NOINDEX = "noindex";
94  
        /** Robots directive that marks the response as no-follow; matched as a substring of the lower-cased value. */
95      private static final String ROBOTS_TAG_NOFOLLOW = "nofollow";
96  
        /** Number of leading bytes read from the response body when checking for a UTF-8 byte order mark. */
97      private static final int UTF8_BOM_SIZE = 3;
98  
        /** Passed as the {@code pruned} flag when the main content is extracted in {@code putAdditionalData}. */
99      public boolean prunedContent = true;
100 
        // NOTE(review): not referenced in the visible part of this file; presumably consulted when
        // child URLs are converted elsewhere -- verify against the rest of the class.
101     public Map<String, String> convertUrlMap = new HashMap<>();
102 
        /** Configuration assigned in {@code init()} from {@code ComponentUtil.getFessConfig()}. */
103     protected FessConfig fessConfig;
104 
        /** When true, {@code getSingleNodeValue} drops text nodes that follow a "googleoff:" comment until a "googleon:" comment. */
105     protected boolean useGoogleOffOn = true;
106 
        /** Per-field flag read in {@code storeData}: a {@code true} entry makes the node selected for that field get pruned. */
107     protected Map<String, Boolean> fieldPrunedRuleMap = new HashMap<>();
108 
109     @PostConstruct
110     public void init() {
111         if (logger.isDebugEnabled()) {
112             logger.debug("Initialize {}", this.getClass().getSimpleName());
113         }
114         fessConfig = ComponentUtil.getFessConfig();
115     }
116 
117     @Override
118     public FessConfig getFessConfig() {
119         return fessConfig;
120     }
121 
122     @Override
123     public Logger getLogger() {
124         return logger;
125     }
126 
127     @Override
128     protected void storeData(final ResponseData responseData, final ResultData resultData) {
129         final DOMParser parser = getDomParser();
130         try (final BufferedInputStream bis = new BufferedInputStream(responseData.getResponseBody())) {
131             final byte[] bomBytes = new byte[UTF8_BOM_SIZE];
132             bis.mark(UTF8_BOM_SIZE);
133             final int size = bis.read(bomBytes);
134             if (size < 3 || !isUtf8BomBytes(bomBytes)) {
135                 bis.reset();
136             }
137             final InputSource is = new InputSource(bis);
138             if (responseData.getCharSet() != null) {
139                 is.setEncoding(responseData.getCharSet());
140             }
141             parser.parse(is);
142         } catch (final Exception e) {
143             throw new CrawlingAccessException("Could not parse " + responseData.getUrl(), e);
144         }
145 
146         final Document document = parser.getDocument();
147 
148         processMetaRobots(responseData, resultData, document);
149         processXRobotsTag(responseData, resultData);
150 
151         final Map<String, Object> dataMap = new LinkedHashMap<>();
152         for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
153             final String path = entry.getValue();
154             try {
155                 final XObject xObj = getXPathAPI().eval(document, path);
156                 final int type = xObj.getType();
157                 switch (type) {
158                 case XObject.CLASS_BOOLEAN:
159                     final boolean b = xObj.bool();
160                     putResultDataBody(dataMap, entry.getKey(), Boolean.toString(b));
161                     break;
162                 case XObject.CLASS_NUMBER:
163                     final double d = xObj.num();
164                     putResultDataBody(dataMap, entry.getKey(), Double.toString(d));
165                     break;
166                 case XObject.CLASS_STRING:
167                     final String str = xObj.str();
168                     putResultDataBody(dataMap, entry.getKey(), str);
169                     break;
170                 case XObject.CLASS_NULL:
171                 case XObject.CLASS_UNKNOWN:
172                 case XObject.CLASS_NODESET:
173                 case XObject.CLASS_RTREEFRAG:
174                 case XObject.CLASS_UNRESOLVEDVARIABLE:
175                 default:
176                     final Boolean isPruned = fieldPrunedRuleMap.get(entry.getKey());
177                     Node value = getXPathAPI().selectSingleNode(document, entry.getValue());
178                     if (value != null && isPruned != null && isPruned.booleanValue()) {
179                         value = pruneNode(value);
180                     }
181                     putResultDataBody(dataMap, entry.getKey(), value != null ? value.getTextContent() : null);
182                     break;
183                 }
184             } catch (final TransformerException e) {
185                 logger.warn("Could not parse a value of " + entry.getKey() + ":" + entry.getValue(), e);
186             }
187         }
188 
189         putAdditionalData(dataMap, responseData, document);
190         normalizeData(responseData, dataMap);
191 
192         try {
193             resultData.setData(SerializeUtil.fromObjectToBinary(dataMap));
194         } catch (final Exception e) {
195             throw new CrawlingAccessException("Could not serialize object: " + responseData.getUrl(), e);
196         }
197         resultData.setEncoding(charsetName);
198     }
199 
200     protected void normalizeData(final ResponseData responseData, final Map<String, Object> dataMap) {
201         final Object titleObj = dataMap.get(fessConfig.getIndexFieldTitle());
202         if (titleObj != null) {
203             dataMap.put(fessConfig.getIndexFieldTitle(),
204                     ComponentUtil.getDocumentHelper().getTitle(responseData, titleObj.toString(), dataMap));
205         }
206     }
207 
208     protected void processMetaRobots(final ResponseData responseData, final ResultData resultData, final Document document) {
209         final Map<String, String> configMap = getConfigPrameterMap(responseData, ConfigName.CONFIG);
210         final String ignore = configMap.get(Config.IGNORE_ROBOTS_TAGS);
211         if (ignore == null) {
212             if (fessConfig.isCrawlerIgnoreRobotsTags()) {
213                 return;
214             }
215         } else if (Boolean.parseBoolean(ignore)) {
216             return;
217         }
218 
219         // meta tag
220         try {
221             final Node value = getXPathAPI().selectSingleNode(document, META_NAME_ROBOTS_CONTENT);
222             if (value != null) {
223                 boolean noindex = false;
224                 boolean nofollow = false;
225                 final String content = value.getTextContent().toLowerCase(Locale.ROOT);
226                 if (content.contains(ROBOTS_TAG_NONE)) {
227                     noindex = true;
228                     nofollow = true;
229                 } else {
230                     if (content.contains(ROBOTS_TAG_NOINDEX)) {
231                         noindex = true;
232                     }
233                     if (content.contains(ROBOTS_TAG_NOFOLLOW)) {
234                         nofollow = true;
235                     }
236                 }
237                 if (noindex && nofollow) {
238                     logger.info("META(robots=noindex,nofollow): " + responseData.getUrl());
239                     throw new ChildUrlsException(Collections.emptySet(), "#processMetaRobots");
240                 } else if (noindex) {
241                     logger.info("META(robots=noindex): " + responseData.getUrl());
242                     storeChildUrls(responseData, resultData);
243                     throw new ChildUrlsException(resultData.getChildUrlSet(), "#processMetaRobots");
244                 } else if (nofollow) {
245                     logger.info("META(robots=nofollow): " + responseData.getUrl());
246                     responseData.setNoFollow(true);
247                 }
248             }
249         } catch (final TransformerException e) {
250             logger.warn("Could not parse a value of " + META_NAME_ROBOTS_CONTENT, e);
251         }
252 
253     }
254 
255     protected void processXRobotsTag(final ResponseData responseData, final ResultData resultData) {
256         final Map<String, String> configMap = getConfigPrameterMap(responseData, ConfigName.CONFIG);
257         final String ignore = configMap.get(Config.IGNORE_ROBOTS_TAGS);
258         if (ignore == null) {
259             if (fessConfig.isCrawlerIgnoreRobotsTags()) {
260                 return;
261             }
262         } else if (Boolean.parseBoolean(ignore)) {
263             return;
264         }
265 
266         // X-Robots-Tag
267         responseData.getMetaDataMap().entrySet().stream().filter(e -> e.getKey().equalsIgnoreCase(X_ROBOTS_TAG) && e.getValue() != null)
268                 .forEach(e -> {
269                     boolean noindex = false;
270                     boolean nofollow = false;
271                     final String value = e.getValue().toString().toLowerCase(Locale.ROOT);
272                     if (value.contains(ROBOTS_TAG_NONE)) {
273                         noindex = true;
274                         nofollow = true;
275                     } else {
276                         if (value.contains(ROBOTS_TAG_NOINDEX)) {
277                             noindex = true;
278                         }
279                         if (value.contains(ROBOTS_TAG_NOFOLLOW)) {
280                             nofollow = true;
281                         }
282                     }
283                     if (noindex && nofollow) {
284                         logger.info("HEADER(robots=noindex,nofollow): " + responseData.getUrl());
285                         throw new ChildUrlsException(Collections.emptySet(), "#processXRobotsTag");
286                     } else if (noindex) {
287                         logger.info("HEADER(robots=noindex): " + responseData.getUrl());
288                         storeChildUrls(responseData, resultData);
289                         throw new ChildUrlsException(resultData.getChildUrlSet(), "#processXRobotsTag");
290                     } else if (nofollow) {
291                         logger.info("HEADER(robots=nofollow): " + responseData.getUrl());
292                         responseData.setNoFollow(true);
293                     }
294                 });
295     }
296 
297     protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
298         final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
299         final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
300         final Map<String, String> configMap = crawlingConfig.getConfigParameterMap(config);
301         return configMap;
302     }
303 
304     protected boolean isValidUrl(final String urlStr) {
305         if (StringUtil.isBlank(urlStr)) {
306             return false;
307         }
308         final String value;
309         if (urlStr.startsWith("://")) {
310             value = "http" + urlStr;
311         } else if (urlStr.startsWith("//")) {
312             value = "http:" + urlStr;
313         } else {
314             value = urlStr;
315         }
316         try {
317             final URL url = new java.net.URL(value);
318             final String host = url.getHost();
319             if (StringUtil.isBlank(host)) {
320                 return false;
321             }
322             if ("http".equalsIgnoreCase(host) || "https".equalsIgnoreCase(host)) {
323                 return false;
324             }
325         } catch (final MalformedURLException e) {
326             return false;
327         }
328         return true;
329     }
330 
331     protected boolean isValidCanonicalUrl(final String url, final String canonicalUrl) {
332         if (url.startsWith("https:") && canonicalUrl.startsWith("http:")) {
333             if (logger.isDebugEnabled()) {
334                 logger.debug("Invalid Canonical Url(https->http): {} -> {}", url, canonicalUrl);
335             }
336             return false;
337         }
338         return true;
339     }
340 
        /**
         * Fills {@code dataMap} with the index fields derived from the response, the
         * crawling configuration and the parsed document.
         * <p>
         * If the document declares a canonical URL that differs from the response URL
         * and passes {@code isValidUrl} and {@code isValidCanonicalUrl}, no field is
         * written; a {@link ChildUrlsException} carrying only the canonical URL is
         * thrown instead.
         *
         * @param dataMap the field map to populate
         * @param responseData the crawled response
         * @param document the parsed document
         * @throws ChildUrlsException when the page points to a different, valid canonical URL
         */
341     protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
342         // canonical
343         final String canonicalUrl = getCanonicalUrl(responseData, document);
344         if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl)
345                 && isValidCanonicalUrl(responseData.getUrl(), canonicalUrl)) {
346             final Set<RequestData> childUrlSet = new HashSet<>();
347             childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
348             logger.info("CANONICAL: " + responseData.getUrl() + " -> " + canonicalUrl);
349             throw new ChildUrlsException(childUrlSet, this.getClass().getName() + "#putAdditionalData");
350         }
351 
            // NOTE: this local shadows the fessConfig field for the rest of the method.
352         final FessConfig fessConfig = ComponentUtil.getFessConfig();
353         final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
354         final String sessionId = crawlingInfoHelper.getCanonicalSessionId(responseData.getSessionId());
355         final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper();
356         final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
357         final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
358         final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
359         final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
360         final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
361         final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
362         final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
363         String url = responseData.getUrl();
            // The indexing target is looked up with the original URL; path mapping is applied afterwards.
364         final String indexingTarget = crawlingConfig.getIndexingTarget(url);
365         url = pathMappingHelper.replaceUrl(sessionId, url);
366         final String mimeType = responseData.getMimeType();
367 
368         final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
369         final Map<String, String> xpathConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.XPATH);
370 
            // The encoding of the queued URL wins over the response charset when it is available.
371         String urlEncoding;
372         final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
373         if (urlQueue != null && urlQueue.getEncoding() != null) {
374             urlEncoding = urlQueue.getEncoding();
375         } else {
376             urlEncoding = responseData.getCharSet();
377         }
378 
379         // cid
380         final String configId = crawlingConfig.getConfigId();
381         if (configId != null) {
382             putResultDataBody(dataMap, fessConfig.getIndexFieldConfigId(), configId);
383         }
384         //  expires
385         if (documentExpires != null) {
386             putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires);
387         }
388         // lang
389         final String lang = systemHelper.normalizeHtmlLang(getSingleNodeValue(document, getLangXpath(fessConfig, xpathConfigMap), true));
390         if (lang != null) {
391             putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), lang);
392         }
393         // title
394         // content
395         final String body = getSingleNodeValue(document, getContentXpath(fessConfig, xpathConfigMap), prunedContent);
396         putResultDataBody(dataMap, fessConfig.getIndexFieldContent(),
397                 documentHelper.getContent(crawlingConfig, responseData, body, dataMap));
            // The raw body is cached only when caching is enabled (per-field config or globally),
            // the mime type is supported and 0 < content length <= configured maximum.
398         if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig
399                 .isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
400             if (responseData.getContentLength() > 0
401                     && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
402                 String charSet = responseData.getCharSet();
403                 if (charSet == null) {
404                     charSet = Constants.UTF_8;
405                 }
406                 try (final BufferedInputStream is = new BufferedInputStream(responseData.getResponseBody())) {
407                     // cache
408                     putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), new String(InputStreamUtil.getBytes(is), charSet));
409                     putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
410                 } catch (final Exception e) {
411                     logger.warn("Failed to write a cache: " + sessionId + ":" + responseData, e);
412                 }
413             } else {
                    // NOTE(review): this branch is also taken when the content length is <= 0,
                    // although the message only mentions the "too large" case.
414                 logger.debug("Content size is too large({} > {}): {}", responseData.getContentLength(),
415                         fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger(), responseData.getUrl());
416             }
417         }
418         // digest
            // A digest extracted via XPath is preferred; otherwise one is derived from the body text.
419         final String digest = getSingleNodeValue(document, getDigestXpath(fessConfig, xpathConfigMap), false);
420         if (StringUtil.isNotBlank(digest)) {
421             putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), digest);
422         } else {
423             putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(),
424                     documentHelper.getDigest(responseData, body, dataMap, fessConfig.getCrawlerDocumentHtmlMaxDigestLengthAsInteger()));
425         }
426         // segment
427         putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
428         // host
429         putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHost(url));
430         // site
431         putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSite(url, urlEncoding));
432         // filename
433         final String fileName = getFileName(url, urlEncoding);
434         if (StringUtil.isNotBlank(fileName)) {
435             putResultDataBody(dataMap, fessConfig.getIndexFieldFilename(), fileName);
436         }
437         // url
438         putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
439         // created
440         final Date now = systemHelper.getCurrentTime();
441         putResultDataBody(dataMap, fessConfig.getIndexFieldCreated(), now);
442         // anchor
443         putResultDataBody(dataMap, fessConfig.getIndexFieldAnchor(), getAnchorList(document, responseData));
444         // mimetype
445         putResultDataBody(dataMap, fessConfig.getIndexFieldMimetype(), mimeType);
446         if (fileTypeHelper != null) {
447             // filetype
448             putResultDataBody(dataMap, fessConfig.getIndexFieldFiletype(), fileTypeHelper.get(mimeType));
449         }
450         // content_length
451         putResultDataBody(dataMap, fessConfig.getIndexFieldContentLength(), Long.toString(responseData.getContentLength()));
452         // last_modified
            // The timestamp falls back to the current time when the response has no last-modified date.
453         final Date lastModified = responseData.getLastModified();
454         if (lastModified != null) {
455             putResultDataBody(dataMap, fessConfig.getIndexFieldLastModified(), lastModified);
456             // timestamp
457             putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), lastModified);
458         } else {
459             // timestamp
460             putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), now);
461         }
462         // indexingTarget
463         putResultDataBody(dataMap, Constants.INDEXING_TARGET, indexingTarget);
464         //  boost
465         putResultDataBody(dataMap, fessConfig.getIndexFieldBoost(), crawlingConfig.getDocumentBoost());
466         // label: labelType
467         putResultDataBody(dataMap, fessConfig.getIndexFieldLabel(), labelTypeHelper.getMatchedLabelValueSet(url));
468         // role: roleType
469         final List<String> roleTypeList = new ArrayList<>();
470         stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
471         putResultDataBody(dataMap, fessConfig.getIndexFieldRole(), roleTypeList);
472         // virtualHosts
473         putResultDataBody(dataMap, fessConfig.getIndexFieldVirtualHost(),
474                 stream(crawlingConfig.getVirtualHosts()).get(stream -> stream.filter(StringUtil::isNotBlank).collect(Collectors.toList())));
475         // id
            // generateId receives the map as filled so far, so this call must stay after the puts above.
476         putResultDataBody(dataMap, fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(dataMap));
477         // parentId
            // The url field is temporarily replaced by the parent URL so that generateId yields
            // the parent's id; the real URL is restored right afterwards.
478         String parentUrl = responseData.getParentUrl();
479         if (StringUtil.isNotBlank(parentUrl)) {
480             parentUrl = pathMappingHelper.replaceUrl(sessionId, parentUrl);
481             putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), parentUrl);
482             putResultDataBody(dataMap, fessConfig.getIndexFieldParentId(), crawlingInfoHelper.generateId(dataMap));
483             putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url); // set again
484         }
485         // thumbnail
486         final String thumbnailUrl = getThumbnailUrl(responseData, document);
487         if (StringUtil.isNotBlank(thumbnailUrl)) {
488             putResultDataBody(dataMap, fessConfig.getIndexFieldThumbnail(), thumbnailUrl);
489         }
490 
491         // from config
            // XPath parameters whose key does not start with "default." are evaluated against the
            // document; VALUE parameters are used as they are. Both go through
            // putResultDataWithTemplate together with the SCRIPT parameter of the same key.
492         final Map<String, String> scriptConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.SCRIPT);
493         xpathConfigMap.entrySet().stream().filter(e -> !e.getKey().startsWith("default.")).forEach(e -> {
494             final String key = e.getKey();
495             final String value = getSingleNodeValue(document, e.getValue(), true);
496             putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key));
497         });
498         crawlingConfig.getConfigParameterMap(ConfigName.VALUE).entrySet().stream().forEach(e -> {
499             final String key = e.getKey();
500             final String value = e.getValue();
501             putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key));
502         });
503     }
504 
505     protected String getLangXpath(final FessConfig fessConfig, final Map<String, String> xpathConfigMap) {
506         final String xpath = xpathConfigMap.get(XPath.DEFAULT_LANG);
507         if (StringUtil.isNotBlank(xpath)) {
508             return xpath;
509         }
510         return fessConfig.getCrawlerDocumentHtmlLangXpath();
511     }
512 
513     protected String getContentXpath(final FessConfig fessConfig, final Map<String, String> xpathConfigMap) {
514         final String xpath = xpathConfigMap.get(XPath.DEFAULT_CONTENT);
515         if (StringUtil.isNotBlank(xpath)) {
516             return xpath;
517         }
518         return fessConfig.getCrawlerDocumentHtmlContentXpath();
519     }
520 
521     protected String getDigestXpath(final FessConfig fessConfig, final Map<String, String> xpathConfigMap) {
522         final String xpath = xpathConfigMap.get(XPath.DEFAULT_DIGEST);
523         if (StringUtil.isNotBlank(xpath)) {
524             return xpath;
525         }
526         return fessConfig.getCrawlerDocumentHtmlDigestXpath();
527     }
528 
529     protected String getCanonicalUrl(final ResponseData responseData, final Document document) {
530         final Map<String, String> configMap = getConfigPrameterMap(responseData, ConfigName.CONFIG);
531         String xpath = configMap.get(Config.HTML_CANONICAL_XPATH);
532         if (xpath == null) {
533             xpath = fessConfig.getCrawlerDocumentHtmlCanonicalXpath();
534         }
535         if (StringUtil.isBlank(xpath)) {
536             return null;
537         }
538         final String canonicalUrl = getSingleNodeValue(document, xpath, false);
539         if (StringUtil.isBlank(canonicalUrl)) {
540             return null;
541         }
542         return normalizeCanonicalUrl(responseData.getUrl(), canonicalUrl);
543     }
544 
545     protected String normalizeCanonicalUrl(final String baseUrl, final String canonicalUrl) {
546         try {
547             final URL u = new URL(baseUrl);
548             return new URL(u, canonicalUrl.startsWith(":") ? u.getProtocol() + canonicalUrl : canonicalUrl).toString();
549         } catch (final MalformedURLException e) {
550             logger.warn("Invalid canonical url: " + baseUrl + " : " + canonicalUrl, e);
551         }
552         return null;
553     }
554 
555     protected String removeCommentTag(final String content) {
556         if (content == null) {
557             return StringUtil.EMPTY;
558         }
559         String value = content;
560         int pos = value.indexOf("<!--");
561         while (pos >= 0) {
562             final int lastPos = value.indexOf("-->", pos);
563             if (lastPos >= 0) {
564                 if (pos == 0) {
565                     value = " " + value.substring(lastPos + 3);
566                 } else {
567                     value = value.substring(0, pos) + " " + value.substring(lastPos + 3);
568                 }
569             } else {
570                 break;
571             }
572             pos = value.indexOf("<!--");
573         }
574         return value;
575     }
576 
577     protected String getSingleNodeValue(final Document document, final String xpath, final boolean pruned) {
578         StringBuilder buf = null;
579         NodeList list = null;
580         try {
581             list = getXPathAPI().selectNodeList(document, xpath);
582             for (int i = 0; i < list.getLength(); i++) {
583                 if (buf == null) {
584                     buf = new StringBuilder(1000);
585                 }
586                 Node node = list.item(i).cloneNode(true);
587                 if (useGoogleOffOn) {
588                     node = processGoogleOffOn(node, new ValueHolder<>(true));
589                 }
590                 if (pruned) {
591                     node = pruneNode(node);
592                 }
593                 parseTextContent(node, buf);
594             }
595         } catch (final Exception e) {
596             logger.warn("Could not parse a value of " + xpath);
597         }
598         if (buf == null) {
599             return null;
600         }
601         return buf.toString().trim();
602     }
603 
604     protected void parseTextContent(final Node node, final StringBuilder buf) {
605         if (node.hasChildNodes()) {
606             final NodeList nodeList = node.getChildNodes();
607             for (int i = 0; i < nodeList.getLength(); i++) {
608                 final Node childNode = nodeList.item(i);
609                 parseTextContent(childNode, buf);
610             }
611         } else if (node.getNodeType() == Node.TEXT_NODE) {
612             final String value = node.getTextContent();
613             if (value != null) {
614                 final String content = value.trim();
615                 if (content.length() > 0) {
616                     buf.append(' ').append(content);
617                 }
618             }
619         }
620     }
621 
622     protected Node processGoogleOffOn(final Node node, final ValueHolder<Boolean> flag) {
623         final NodeList nodeList = node.getChildNodes();
624         List<Node> removedNodeList = null;
625         for (int i = 0; i < nodeList.getLength(); i++) {
626             final Node childNode = nodeList.item(i);
627             if (childNode.getNodeType() == Node.COMMENT_NODE) {
628                 final String comment = childNode.getNodeValue().trim();
629                 if (comment.startsWith("googleoff:")) {
630                     flag.setValue(false);
631                 } else if (comment.startsWith("googleon:")) {
632                     flag.setValue(true);
633                 }
634             }
635 
636             if (!flag.getValue() && childNode.getNodeType() == Node.TEXT_NODE) {
637                 if (removedNodeList == null) {
638                     removedNodeList = new ArrayList<>();
639                 }
640                 removedNodeList.add(childNode);
641             } else {
642                 processGoogleOffOn(childNode, flag);
643             }
644         }
645 
646         if (removedNodeList != null) {
647             removedNodeList.stream().forEach(n -> node.removeChild(n));
648         }
649 
650         return node;
651     }
652 
653     protected Node pruneNode(final Node node) {
654         final NodeList nodeList = node.getChildNodes();
655         final List<Node> childNodeList = new ArrayList<>();
656         final List<Node> removedNodeList = new ArrayList<>();
657         for (int i = 0; i < nodeList.getLength(); i++) {
658             final Node childNode = nodeList.item(i);
659             if (isPrunedTag(childNode)) {
660                 removedNodeList.add(childNode);
661             } else {
662                 childNodeList.add(childNode);
663             }
664         }
665 
666         for (final Node childNode : removedNodeList) {
667             node.removeChild(childNode);
668         }
669 
670         for (final Node childNode : childNodeList) {
671             pruneNode(childNode);
672         }
673 
674         return node;
675     }
676 
677     protected boolean isPrunedTag(final Node node) {
678         for (final PrunedTag prunedTag : fessConfig.getCrawlerDocumentHtmlPrunedTagsAsArray()) {
679             if (prunedTag.matches(node)) {
680                 return true;
681             }
682         }
683         return false;
684     }
685 
686     protected String getMultipleNodeValue(final Document document, final String xpath) {
687         NodeList nodeList = null;
688         final StringBuilder buf = new StringBuilder(100);
689         try {
690             nodeList = getXPathAPI().selectNodeList(document, xpath);
691             for (int i = 0; i < nodeList.getLength(); i++) {
692                 final Node node = nodeList.item(i);
693                 buf.append(node.getTextContent());
694                 buf.append("\n");
695             }
696         } catch (final Exception e) {
697             logger.warn("Could not parse a value of " + xpath, e);
698         }
699         return buf.toString().trim();
700     }
701 
702     protected String replaceDuplicateHost(final String url) {
703         try {
704             // remove duplicate host
705             final DuplicateHostHelper duplicateHostHelper = ComponentUtil.getDuplicateHostHelper();
706             return duplicateHostHelper.convert(url);
707         } catch (final Exception e) {
708             return url;
709         }
710     }
711 
712     protected List<String> getAnchorList(final Document document, final ResponseData responseData) {
713         List<RequestData> anchorList = new ArrayList<>();
714         final String baseHref = getBaseHref(document);
715         try {
716             final URL url = getBaseUrl(responseData.getUrl(), baseHref);
717             for (final Map.Entry<String, String> entry : childUrlRuleMap.entrySet()) {
718                 for (final String u : getUrlFromTagAttribute(url, document, entry.getKey(), entry.getValue(), responseData.getCharSet())) {
719                     anchorList.add(RequestDataBuilder.newRequestData().get().url(u).build());
720                 }
721             }
722             anchorList = convertChildUrlList(anchorList);
723         } catch (final Exception e) {
724             logger.warn("Could not parse anchor tags.", e);
725         }
726 
727         final List<String> urlList = new ArrayList<>(anchorList.size());
728         for (final RequestData requestData : anchorList) {
729             urlList.add(requestData.getUrl());
730         }
731         return urlList;
732     }
733 
734     protected URL getBaseUrl(final String currentUrl, final String baseHref) throws MalformedURLException {
735         if (baseHref != null) {
736             return getURL(currentUrl, baseHref);
737         }
738         return new URL(currentUrl);
739     }
740 
741     @Override
742     protected List<RequestData> convertChildUrlList(final List<RequestData> urlList) {
743         if (urlList != null) {
744             for (final RequestData requestData : urlList) {
745                 String url = requestData.getUrl();
746                 for (final Map.Entry<String, String> entry : convertUrlMap.entrySet()) {
747                     url = url.replaceAll(entry.getKey(), entry.getValue());
748                 }
749                 requestData.setUrl(replaceDuplicateHost(url));
750             }
751         }
752         return urlList;
753     }
754 
755     @Override
756     public Object getData(final AccessResultData<?> accessResultData) {
757         final byte[] data = accessResultData.getData();
758         if (data != null) {
759             try {
760                 return SerializeUtil.fromBinaryToObject(data);
761             } catch (final Exception e) {
762                 throw new CrawlerSystemException("Could not create an instanced from bytes.", e);
763             }
764         }
765         return new HashMap<String, Object>();
766     }
767 
768     @Override
769     protected void addChildUrlFromTagAttribute(final List<String> urlList, final URL url, final String attrValue, final String encoding) {
770         final String urlValue = attrValue.trim();
771         URL childUrl;
772         String u = null;
773         try {
774             childUrl = new URL(url, urlValue.startsWith(":") ? url.getProtocol() + urlValue : urlValue);
775             u = encodeUrl(normalizeUrl(childUrl.toExternalForm()), encoding);
776         } catch (final MalformedURLException e) {
777             final int pos = urlValue.indexOf(':');
778             if (pos > 0 && pos < 10) {
779                 u = encodeUrl(normalizeUrl(urlValue), encoding);
780             }
781         }
782 
783         if (u == null) {
784             logger.warn("Ignored child URL: " + attrValue + " in " + url);
785             return;
786         }
787 
788         if (logger.isDebugEnabled()) {
789             logger.debug("{} -> {}", attrValue, u);
790         }
791         if (StringUtil.isNotBlank(u)) {
792             if (logger.isDebugEnabled()) {
793                 logger.debug("Add Child: {}", u);
794             }
795             urlList.add(u);
796         } else {
797             if (logger.isDebugEnabled()) {
798                 logger.debug("Skip Child: {}", u);
799             }
800         }
801     }
802 
803     private boolean isUtf8BomBytes(final byte[] b) {
804         return b[0] == (byte) 0xEF && b[1] == (byte) 0xBB && b[2] == (byte) 0xBF;
805     }
806 
807     public void setUseGoogleOffOn(final boolean useGoogleOffOn) {
808         this.useGoogleOffOn = useGoogleOffOn;
809     }
810 
811     protected String getThumbnailUrl(final ResponseData responseData, final Document document) {
812         // TODO PageMap
813         try {
814             // meta thumbnail
815             final Node thumbnailNode = getXPathAPI().selectSingleNode(document, META_NAME_THUMBNAIL_CONTENT);
816             if (thumbnailNode != null) {
817                 final String content = thumbnailNode.getTextContent();
818                 if (StringUtil.isNotBlank(content)) {
819                     final URL thumbnailUrl = getURL(responseData.getUrl(), content);
820                     if (thumbnailUrl != null) {
821                         return thumbnailUrl.toExternalForm();
822                     }
823                 }
824             }
825 
826             // meta og:image
827             final Node ogImageNode = getXPathAPI().selectSingleNode(document, META_PROPERTY_OGIMAGE_CONTENT);
828             if (ogImageNode != null) {
829                 final String content = ogImageNode.getTextContent();
830                 if (StringUtil.isNotBlank(content)) {
831                     final URL thumbnailUrl = getURL(responseData.getUrl(), content);
832                     if (thumbnailUrl != null) {
833                         return thumbnailUrl.toExternalForm();
834                     }
835                 }
836             }
837 
838             final NodeList imgNodeList = getXPathAPI().selectNodeList(document, fessConfig.getThumbnailHtmlImageXpath());
839             String firstThumbnailUrl = null;
840             for (int i = 0; i < imgNodeList.getLength(); i++) {
841                 final Node imgNode = imgNodeList.item(i);
842                 if (logger.isDebugEnabled()) {
843                     logger.debug("img tag: {}", imgNode);
844                 }
845                 final NamedNodeMap attributes = imgNode.getAttributes();
846                 final String thumbnailUrl = getThumbnailSrc(responseData.getUrl(), attributes);
847                 final Integer height = getAttributeAsInteger(attributes, "height");
848                 final Integer width = getAttributeAsInteger(attributes, "width");
849                 if (!fessConfig.isThumbnailHtmlImageUrl(thumbnailUrl)) {
850                     continue;
851                 } else if (height != null && width != null) {
852                     try {
853                         if (fessConfig.validateThumbnailSize(width, height)) {
854                             return thumbnailUrl;
855                         }
856                     } catch (final Exception e) {
857                         logger.debug("Failed to parse {} at {}", imgNode, responseData.getUrl(), e);
858                     }
859                 } else if (firstThumbnailUrl == null) {
860                     firstThumbnailUrl = thumbnailUrl;
861                 }
862             }
863 
864             if (firstThumbnailUrl != null) {
865                 return firstThumbnailUrl;
866             }
867         } catch (final Exception e) {
868             logger.warn("Failed to retrieve thumbnail url from " + responseData.getUrl(), e);
869         }
870         return null;
871     }
872 
873     protected String getThumbnailSrc(final String url, final NamedNodeMap attributes) {
874         final Node srcNode = attributes.getNamedItem("src");
875         if (srcNode != null) {
876             try {
877                 final URL thumbnailUrl = getURL(url, srcNode.getTextContent());
878                 if (thumbnailUrl != null) {
879                     return thumbnailUrl.toExternalForm();
880                 }
881             } catch (final Exception e) {
882                 if (logger.isDebugEnabled()) {
883                     logger.debug("Failed to parse thumbnail url for {} : {}", url, attributes, e);
884                 }
885             }
886         }
887         return null;
888     }
889 
890     protected Integer getAttributeAsInteger(final NamedNodeMap attributes, final String name) {
891         final Node namedItem = attributes.getNamedItem(name);
892         if (namedItem == null) {
893             return null;
894         }
895         final String value = namedItem.getTextContent();
896         if (value == null) {
897             return null;
898         }
899         try {
900             return Integer.parseInt(value);
901         } catch (final NumberFormatException e) {
902             if (value.endsWith("%") || value.endsWith("px")) {
903                 return null;
904             }
905             return 0;
906         }
907     }
908 
909     protected URL getURL(final String currentUrl, final String url) throws MalformedURLException {
910         if (url != null) {
911             if (url.startsWith("://")) {
912                 final String protocol = currentUrl.split(":")[0];
913                 return new URL(protocol + url);
914             } else if (url.startsWith("//")) {
915                 final String protocol = currentUrl.split(":")[0];
916                 return new URL(protocol + ":" + url);
917             } else if (url.startsWith("/") || url.indexOf(':') == -1) {
918                 return new URL(new URL(currentUrl), url);
919             }
920             return new URL(url);
921         }
922         return null;
923     }
924 
925     public void addFieldRule(final String name, final String xpath, final boolean isPruned) {
926         addFieldRule(name, xpath);
927         fieldPrunedRuleMap.put(name, isPruned);
928     }
929 }