View Javadoc
1   /*
2    * Copyright 2012-2019 CodeLibs Project and the Others.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
13   * either express or implied. See the License for the specific language
14   * governing permissions and limitations under the License.
15   */
16  package org.codelibs.fess.crawler.transformer;
17  
18  import java.net.URLDecoder;
19  import java.util.Arrays;
20  import java.util.Collection;
21  import java.util.Collections;
22  import java.util.HashMap;
23  import java.util.Map;
24  
25  import org.apache.commons.lang3.StringUtils;
26  import org.codelibs.core.collection.LruHashMap;
27  import org.codelibs.core.lang.StringUtil;
28  import org.codelibs.fess.Constants;
29  import org.codelibs.fess.crawler.entity.AccessResult;
30  import org.codelibs.fess.crawler.entity.AccessResultData;
31  import org.codelibs.fess.crawler.entity.UrlQueue;
32  import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
33  import org.codelibs.fess.mylasta.direction.FessConfig;
34  import org.codelibs.fess.util.ComponentUtil;
35  import org.codelibs.fess.util.GroovyUtil;
36  import org.slf4j.Logger;
37  
38  public interface FessTransformer {
39  
40      Map<String, String> parentEncodingMap = Collections.synchronizedMap(new LruHashMap<String, String>(1000));
41  
42      FessConfig getFessConfig();
43  
44      Logger getLogger();
45  
46      default String getHost(final String u) {
47          if (StringUtil.isBlank(u)) {
48              return StringUtil.EMPTY; // empty
49          }
50  
51          String url = u;
52          final String originalUrl = url;
53  
54          int idx = url.indexOf("://");
55          if (idx >= 0) {
56              url = url.substring(idx + 3);
57          }
58  
59          idx = url.indexOf('/');
60          if (idx >= 0) {
61              url = url.substring(0, idx);
62          }
63  
64          if (url.equals(originalUrl)) {
65              return getFessConfig().getCrawlerDocumentUnknownHostname();
66          }
67  
68          return url;
69      }
70  
71      default String getSite(final String u, final String encoding) {
72          if (StringUtil.isBlank(u)) {
73              return StringUtil.EMPTY; // empty
74          }
75  
76          String url = u;
77          int idx = url.indexOf("://");
78          if (idx >= 0) {
79              url = url.substring(idx + 3);
80          }
81  
82          idx = url.indexOf('?');
83          if (idx >= 0) {
84              url = url.substring(0, idx);
85          }
86  
87          if (encoding != null) {
88              String enc;
89              if (StringUtil.isNotBlank(getFessConfig().getCrawlerDocumentSiteEncoding())) {
90                  if (getFessConfig().isCrawlerDocumentUseSiteEncodingOnEnglish()) {
91                      if ("ISO-8859-1".equalsIgnoreCase(encoding) || "US-ASCII".equalsIgnoreCase(encoding)) {
92                          enc = getFessConfig().getCrawlerDocumentSiteEncoding();
93                      } else {
94                          enc = encoding;
95                      }
96                  } else {
97                      enc = getFessConfig().getCrawlerDocumentSiteEncoding();
98                  }
99              } else {
100                 enc = encoding;
101             }
102 
103             try {
104                 url = URLDecoder.decode(url, enc);
105             } catch (final Exception e) {}
106         }
107 
108         return abbreviateSite(url);
109     }
110 
111     default void putResultDataBody(final Map<String, Object> dataMap, final String key, final Object value) {
112         final FessConfig fessConfig = ComponentUtil.getFessConfig();
113         if (fessConfig.getIndexFieldUrl().equals(key)) {
114             dataMap.put(key, value);
115         } else if (dataMap.containsKey(key)) {
116             if (getFessConfig().isCrawlerDocumentAppendData()) {
117                 final Object oldValue = dataMap.get(key);
118                 final Object[] oldValues;
119                 if (oldValue instanceof Object[]) {
120                     oldValues = (Object[]) oldValue;
121                 } else if (oldValue instanceof Collection<?>) {
122                     oldValues = ((Collection<?>) oldValue).toArray();
123                 } else {
124                     oldValues = new Object[] { oldValue };
125                 }
126                 if (value.getClass().isArray()) {
127                     final Object[] newValues = (Object[]) value;
128                     final Object[] values = Arrays.copyOf(oldValues, oldValues.length + newValues.length);
129                     for (int i = 0; i < newValues.length; i++) {
130                         values[values.length - 1 + i] = newValues[i];
131                     }
132                     dataMap.put(key, values);
133                 } else {
134                     final Object[] values = Arrays.copyOf(oldValues, oldValues.length + 1);
135                     values[values.length - 1] = value;
136                     dataMap.put(key, values);
137                 }
138             } else {
139                 dataMap.put(key, value);
140             }
141         } else {
142             dataMap.put(key, value);
143         }
144     }
145 
146     default void putResultDataWithTemplate(final Map<String, Object> dataMap, final String key, final Object value, final String template) {
147         Object target = value;
148         if (template != null) {
149             final Map<String, Object> contextMap = new HashMap<>();
150             contextMap.put("doc", dataMap);
151             final Map<String, Object> paramMap = new HashMap<>(dataMap.size() + 2);
152             paramMap.putAll(dataMap);
153             paramMap.put("value", target);
154             paramMap.put("context", contextMap);
155             target = evaluateValue(template, paramMap);
156         }
157         if (key != null && target != null) {
158             putResultDataBody(dataMap, key, target);
159         }
160     }
161 
162     default Object evaluateValue(final String template, final Map<String, Object> paramMap) {
163         if (StringUtil.isEmpty(template)) {
164             return StringUtil.EMPTY;
165         }
166 
167         return GroovyUtil.evaluate(template, paramMap);
168     }
169 
170     default int getMaxSiteLength() {
171         return getFessConfig().getCrawlerDocumentMaxSiteLengthAsInteger();
172     }
173 
174     default String abbreviateSite(final String value) {
175         final int maxSiteLength = getMaxSiteLength();
176         if (maxSiteLength > -1) {
177             return StringUtils.abbreviate(value, maxSiteLength);
178         } else {
179             return value;
180         }
181     }
182 
183     default String getFileName(final String url, final String encoding) {
184         if (StringUtil.isBlank(url)) {
185             return StringUtil.EMPTY;
186         }
187 
188         String u = url;
189         int idx = u.lastIndexOf('?');
190         if (idx >= 0) {
191             u = u.substring(0, idx);
192         }
193 
194         idx = u.lastIndexOf('#');
195         if (idx >= 0) {
196             u = u.substring(0, idx);
197         }
198         u = decodeUrlAsName(u, u.startsWith("file:"));
199         idx = u.lastIndexOf('/');
200         if (idx >= 0) {
201             if (u.length() > idx + 1) {
202                 u = u.substring(idx + 1);
203             } else {
204                 u = StringUtil.EMPTY;
205             }
206         }
207         return u;
208     }
209 
210     default String decodeUrlAsName(final String url, final boolean escapePlus) {
211         if (url == null) {
212             return null;
213         }
214 
215         final FessConfig fessConfig = getFessConfig();
216         String enc = Constants.UTF_8;
217         if (StringUtil.isBlank(fessConfig.getCrawlerDocumentFileNameEncoding())) {
218             final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
219             if (urlQueue != null) {
220                 final String parentUrl = urlQueue.getParentUrl();
221                 if (StringUtil.isNotEmpty(parentUrl)) {
222                     final String sessionId = urlQueue.getSessionId();
223                     final String pageEnc = getParentEncoding(parentUrl, sessionId);
224                     if (pageEnc != null) {
225                         enc = pageEnc;
226                     } else if (urlQueue.getEncoding() != null) {
227                         enc = urlQueue.getEncoding();
228                     }
229                 }
230             }
231         } else {
232             enc = fessConfig.getCrawlerDocumentFileNameEncoding();
233         }
234 
235         final String escapedUrl = escapePlus ? url.replace("+", "%2B") : url;
236         try {
237             return URLDecoder.decode(escapedUrl, enc);
238         } catch (final Exception e) {
239             return url;
240         }
241     }
242 
243     default String getParentEncoding(final String parentUrl, final String sessionId) {
244         final String key = sessionId + ":" + parentUrl;
245         String enc = parentEncodingMap.get(key);
246         if (enc != null) {
247             return enc;
248         }
249 
250         final AccessResult<?> accessResult = ComponentUtil.getDataService().getAccessResult(sessionId, parentUrl);
251         if (accessResult != null) {
252             final AccessResultData<?> accessResultData = accessResult.getAccessResultData();
253             if (accessResultData != null && accessResultData.getEncoding() != null) {
254                 enc = accessResultData.getEncoding();
255                 parentEncodingMap.put(key, enc);
256                 return enc;
257             }
258         }
259         return null;
260     }
261 }